//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86ShuffleDecodeConstantPool.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
25 #include "llvm/ADT/SmallBitVector.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/CodeGen/IntrinsicLowering.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
36 #include "llvm/CodeGen/MachineModuleInfo.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/TargetLowering.h"
39 #include "llvm/CodeGen/WinEHFuncInfo.h"
40 #include "llvm/IR/CallSite.h"
41 #include "llvm/IR/CallingConv.h"
42 #include "llvm/IR/Constants.h"
43 #include "llvm/IR/DerivedTypes.h"
44 #include "llvm/IR/DiagnosticInfo.h"
45 #include "llvm/IR/Function.h"
46 #include "llvm/IR/GlobalAlias.h"
47 #include "llvm/IR/GlobalVariable.h"
48 #include "llvm/IR/Instructions.h"
49 #include "llvm/IR/Intrinsics.h"
50 #include "llvm/MC/MCAsmInfo.h"
51 #include "llvm/MC/MCContext.h"
52 #include "llvm/MC/MCExpr.h"
53 #include "llvm/MC/MCSymbol.h"
54 #include "llvm/Support/CommandLine.h"
55 #include "llvm/Support/Debug.h"
56 #include "llvm/Support/ErrorHandling.h"
57 #include "llvm/Support/KnownBits.h"
58 #include "llvm/Support/MathExtras.h"
59 #include "llvm/Target/TargetOptions.h"
66 #define DEBUG_TYPE "x86-isel"
68 STATISTIC(NumTailCalls, "Number of tail calls");
70 static cl::opt<bool> ExperimentalVectorWideningLegalization(
71 "x86-experimental-vector-widening-legalization", cl::init(false),
72 cl::desc("Enable an experimental vector type legalization through widening "
73 "rather than promotion."),
76 static cl::opt<int> ExperimentalPrefLoopAlignment(
77 "x86-experimental-pref-loop-alignment", cl::init(4),
78 cl::desc("Sets the preferable loop alignment for experiments "
79 "(the last x86-experimental-pref-loop-alignment bits"
80 " of the loop header PC will be 0)."),
83 static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
89 /// Call this when the user attempts to do something unsupported, like
90 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
91 /// report_fatal_error, so calling code should attempt to recover without
93 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
95 MachineFunction &MF = DAG.getMachineFunction();
96 DAG.getContext()->diagnose(
97 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
100 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
101 const X86Subtarget &STI)
102 : TargetLowering(TM), Subtarget(STI) {
103 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
104 X86ScalarSSEf64 = Subtarget.hasSSE2();
105 X86ScalarSSEf32 = Subtarget.hasSSE1();
106 MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
108 // Set up the TargetLowering object.
110 // X86 is weird. It always uses i8 for shift amounts and setcc results.
111 setBooleanContents(ZeroOrOneBooleanContent);
112 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
113 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
115 // For 64-bit, since we have so many registers, use the ILP scheduler.
116 // For 32-bit, use the register pressure specific scheduling.
117 // For Atom, always use ILP scheduling.
118 if (Subtarget.isAtom())
119 setSchedulingPreference(Sched::ILP);
120 else if (Subtarget.is64Bit())
121 setSchedulingPreference(Sched::ILP);
123 setSchedulingPreference(Sched::RegPressure);
124 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
125 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
127 // Bypass expensive divides and use cheaper ones.
128 if (TM.getOptLevel() >= CodeGenOpt::Default) {
129 if (Subtarget.hasSlowDivide32())
130 addBypassSlowDiv(32, 8);
131 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
132 addBypassSlowDiv(64, 32);
135 if (Subtarget.isTargetKnownWindowsMSVC() ||
136 Subtarget.isTargetWindowsItanium()) {
137 // Setup Windows compiler runtime calls.
138 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
139 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
140 setLibcallName(RTLIB::SREM_I64, "_allrem");
141 setLibcallName(RTLIB::UREM_I64, "_aullrem");
142 setLibcallName(RTLIB::MUL_I64, "_allmul");
143 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
144 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
145 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
146 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
147 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
150 if (Subtarget.isTargetDarwin()) {
151 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
152 setUseUnderscoreSetJmp(false);
153 setUseUnderscoreLongJmp(false);
154 } else if (Subtarget.isTargetWindowsGNU()) {
155 // MS runtime is weird: it exports _setjmp, but longjmp!
156 setUseUnderscoreSetJmp(true);
157 setUseUnderscoreLongJmp(false);
159 setUseUnderscoreSetJmp(true);
160 setUseUnderscoreLongJmp(true);
163 // Set up the register classes.
164 addRegisterClass(MVT::i8, &X86::GR8RegClass);
165 addRegisterClass(MVT::i16, &X86::GR16RegClass);
166 addRegisterClass(MVT::i32, &X86::GR32RegClass);
167 if (Subtarget.is64Bit())
168 addRegisterClass(MVT::i64, &X86::GR64RegClass);
170 for (MVT VT : MVT::integer_valuetypes())
171 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
173 // We don't accept any truncstore of integer registers.
174 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
175 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
176 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
177 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
178 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
179 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
181 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
183 // SETOEQ and SETUNE require checking two conditions.
184 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
185 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
186 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
187 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
188 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
189 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
192 if (Subtarget.hasCMov()) {
193 setOperationAction(ISD::ABS , MVT::i16 , Custom);
194 setOperationAction(ISD::ABS , MVT::i32 , Custom);
195 if (Subtarget.is64Bit())
196 setOperationAction(ISD::ABS , MVT::i64 , Custom);
199 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
201 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
202 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
203 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
205 if (Subtarget.is64Bit()) {
206 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
207 // f32/f64 are legal, f80 is custom.
208 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
210 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
211 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
212 } else if (!Subtarget.useSoftFloat()) {
213 // We have an algorithm for SSE2->double, and we turn this into a
214 // 64-bit FILD followed by conditional FADD for other targets.
215 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
216 // We have an algorithm for SSE2, and we turn this into a 64-bit
217 // FILD or VCVTUSI2SS/SD for other targets.
218 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
221 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
223 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
224 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
226 if (!Subtarget.useSoftFloat()) {
227 // SSE has no i16 to fp conversion, only i32.
228 if (X86ScalarSSEf32) {
229 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
230 // f32 and f64 cases are Legal, f80 case is not
231 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
233 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
234 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
237 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
238 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
241 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
243 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
244 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
246 if (!Subtarget.useSoftFloat()) {
247 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
248 // are Legal, f80 is custom lowered.
249 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
250 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
252 if (X86ScalarSSEf32) {
253 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
254 // f32 and f64 cases are Legal, f80 case is not
255 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
257 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
258 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
261 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
262 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
263 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
266 // Handle FP_TO_UINT by promoting the destination to a larger signed
268 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
269 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
270 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
272 if (Subtarget.is64Bit()) {
273 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
274 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
275 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
276 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
278 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
279 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
281 } else if (!Subtarget.useSoftFloat()) {
282 // Since AVX is a superset of SSE3, only check for SSE here.
283 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
284 // Expand FP_TO_UINT into a select.
285 // FIXME: We would like to use a Custom expander here eventually to do
286 // the optimal thing for SSE vs. the default expansion in the legalizer.
287 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
289 // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
290 // With SSE3 we can use fisttpll to convert to a signed i64; without
291 // SSE, we're stuck with a fistpll.
292 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
294 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
297 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
298 if (!X86ScalarSSEf64) {
299 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
300 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
301 if (Subtarget.is64Bit()) {
302 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
303 // Without SSE, i64->f64 goes through memory.
304 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
306 } else if (!Subtarget.is64Bit())
307 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
309 // Scalar integer divide and remainder are lowered to use operations that
310 // produce two results, to match the available instructions. This exposes
311 // the two-result form to trivial CSE, which is able to combine x/y and x%y
312 // into a single instruction.
314 // Scalar integer multiply-high is also lowered to use two-result
315 // operations, to match the available instructions. However, plain multiply
316 // (low) operations are left as Legal, as there are single-result
317 // instructions for this in x86. Using the two-result multiply instructions
318 // when both high and low results are needed must be arranged by dagcombine.
319 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
320 setOperationAction(ISD::MULHS, VT, Expand);
321 setOperationAction(ISD::MULHU, VT, Expand);
322 setOperationAction(ISD::SDIV, VT, Expand);
323 setOperationAction(ISD::UDIV, VT, Expand);
324 setOperationAction(ISD::SREM, VT, Expand);
325 setOperationAction(ISD::UREM, VT, Expand);
328 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
329 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
330 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
331 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
332 setOperationAction(ISD::BR_CC, VT, Expand);
333 setOperationAction(ISD::SELECT_CC, VT, Expand);
335 if (Subtarget.is64Bit())
336 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
337 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
338 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
339 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
340 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
342 setOperationAction(ISD::FREM , MVT::f32 , Expand);
343 setOperationAction(ISD::FREM , MVT::f64 , Expand);
344 setOperationAction(ISD::FREM , MVT::f80 , Expand);
345 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
347 // Promote the i8 variants and force them on up to i32 which has a shorter
349 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
350 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
351 if (!Subtarget.hasBMI()) {
352 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
353 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
354 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
355 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
356 if (Subtarget.is64Bit()) {
357 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
358 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
362 if (Subtarget.hasLZCNT()) {
363 // When promoting the i8 variants, force them to i32 for a shorter
365 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
366 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
368 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
369 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
370 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
371 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
372 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
373 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
374 if (Subtarget.is64Bit()) {
375 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
376 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
380 // Special handling for half-precision floating point conversions.
381 // If we don't have F16C support, then lower half float conversions
382 // into library calls.
383 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
384 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
385 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
388 // There's never any support for operations beyond MVT::f32.
389 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
390 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
391 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
392 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
394 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
395 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
396 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
397 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
398 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
399 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
401 if (Subtarget.hasPOPCNT()) {
402 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
404 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
405 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
406 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
407 if (Subtarget.is64Bit())
408 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
411 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
413 if (!Subtarget.hasMOVBE())
414 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
416 // These should be promoted to a larger select which is supported.
417 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
418 // X86 wants to expand cmov itself.
419 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
420 setOperationAction(ISD::SELECT, VT, Custom);
421 setOperationAction(ISD::SETCC, VT, Custom);
423 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
424 if (VT == MVT::i64 && !Subtarget.is64Bit())
426 setOperationAction(ISD::SELECT, VT, Custom);
427 setOperationAction(ISD::SETCC, VT, Custom);
430 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
431 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
432 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
434 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
435 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
436 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
437 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
438 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
439 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
440 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
441 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
444 for (auto VT : { MVT::i32, MVT::i64 }) {
445 if (VT == MVT::i64 && !Subtarget.is64Bit())
447 setOperationAction(ISD::ConstantPool , VT, Custom);
448 setOperationAction(ISD::JumpTable , VT, Custom);
449 setOperationAction(ISD::GlobalAddress , VT, Custom);
450 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
451 setOperationAction(ISD::ExternalSymbol , VT, Custom);
452 setOperationAction(ISD::BlockAddress , VT, Custom);
455 // 64-bit shl, sra, srl (iff 32-bit x86)
456 for (auto VT : { MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
459 setOperationAction(ISD::SHL_PARTS, VT, Custom);
460 setOperationAction(ISD::SRA_PARTS, VT, Custom);
461 setOperationAction(ISD::SRL_PARTS, VT, Custom);
464 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
465 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
467 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
469 // Expand certain atomics
470 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
471 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
472 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
473 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
474 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
475 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
476 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
477 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
480 if (Subtarget.hasCmpxchg16b()) {
481 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
484 // FIXME - use subtarget debug flags
485 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
486 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
487 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
488 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
491 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
492 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
494 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
495 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
497 setOperationAction(ISD::TRAP, MVT::Other, Legal);
498 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
500 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
501 setOperationAction(ISD::VASTART , MVT::Other, Custom);
502 setOperationAction(ISD::VAEND , MVT::Other, Expand);
503 bool Is64Bit = Subtarget.is64Bit();
504 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
505 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
507 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
508 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
510 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
512 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
513 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
514 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
516 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
517 // f32 and f64 use SSE.
518 // Set up the FP register classes.
519 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
520 : &X86::FR32RegClass);
521 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
522 : &X86::FR64RegClass);
524 for (auto VT : { MVT::f32, MVT::f64 }) {
525 // Use ANDPD to simulate FABS.
526 setOperationAction(ISD::FABS, VT, Custom);
528 // Use XORP to simulate FNEG.
529 setOperationAction(ISD::FNEG, VT, Custom);
531 // Use ANDPD and ORPD to simulate FCOPYSIGN.
532 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
534 // We don't support sin/cos/fmod
535 setOperationAction(ISD::FSIN , VT, Expand);
536 setOperationAction(ISD::FCOS , VT, Expand);
537 setOperationAction(ISD::FSINCOS, VT, Expand);
540 // Lower this to MOVMSK plus an AND.
541 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
542 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
544 // Expand FP immediates into loads from the stack, except for the special
546 addLegalFPImmediate(APFloat(+0.0)); // xorpd
547 addLegalFPImmediate(APFloat(+0.0f)); // xorps
548 } else if (UseX87 && X86ScalarSSEf32) {
549 // Use SSE for f32, x87 for f64.
550 // Set up the FP register classes.
551 addRegisterClass(MVT::f32, &X86::FR32RegClass);
552 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
554 // Use ANDPS to simulate FABS.
555 setOperationAction(ISD::FABS , MVT::f32, Custom);
557 // Use XORP to simulate FNEG.
558 setOperationAction(ISD::FNEG , MVT::f32, Custom);
560 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
562 // Use ANDPS and ORPS to simulate FCOPYSIGN.
563 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
564 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
566 // We don't support sin/cos/fmod
567 setOperationAction(ISD::FSIN , MVT::f32, Expand);
568 setOperationAction(ISD::FCOS , MVT::f32, Expand);
569 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
571 // Special cases we handle for FP constants.
572 addLegalFPImmediate(APFloat(+0.0f)); // xorps
573 addLegalFPImmediate(APFloat(+0.0)); // FLD0
574 addLegalFPImmediate(APFloat(+1.0)); // FLD1
575 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
576 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
578 // Always expand sin/cos functions even though x87 has an instruction.
579 setOperationAction(ISD::FSIN , MVT::f64, Expand);
580 setOperationAction(ISD::FCOS , MVT::f64, Expand);
581 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
583 // f32 and f64 in x87.
584 // Set up the FP register classes.
585 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
586 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
588 for (auto VT : { MVT::f32, MVT::f64 }) {
589 setOperationAction(ISD::UNDEF, VT, Expand);
590 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
592 // Always expand sin/cos functions even though x87 has an instruction.
593 setOperationAction(ISD::FSIN , VT, Expand);
594 setOperationAction(ISD::FCOS , VT, Expand);
595 setOperationAction(ISD::FSINCOS, VT, Expand);
597 addLegalFPImmediate(APFloat(+0.0)); // FLD0
598 addLegalFPImmediate(APFloat(+1.0)); // FLD1
599 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
600 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
601 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
602 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
603 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
604 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
607 // We don't support FMA.
608 setOperationAction(ISD::FMA, MVT::f64, Expand);
609 setOperationAction(ISD::FMA, MVT::f32, Expand);
611 // Long double always uses X87, except f128 in MMX.
613 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
614 addRegisterClass(MVT::f128, &X86::FR128RegClass);
615 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
616 setOperationAction(ISD::FABS , MVT::f128, Custom);
617 setOperationAction(ISD::FNEG , MVT::f128, Custom);
618 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
621 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
622 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
623 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
625 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
626 addLegalFPImmediate(TmpFlt); // FLD0
628 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
631 APFloat TmpFlt2(+1.0);
632 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
634 addLegalFPImmediate(TmpFlt2); // FLD1
635 TmpFlt2.changeSign();
636 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
639 // Always expand sin/cos functions even though x87 has an instruction.
640 setOperationAction(ISD::FSIN , MVT::f80, Expand);
641 setOperationAction(ISD::FCOS , MVT::f80, Expand);
642 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
644 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
645 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
646 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
647 setOperationAction(ISD::FRINT, MVT::f80, Expand);
648 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
649 setOperationAction(ISD::FMA, MVT::f80, Expand);
652 // Always use a library call for pow.
653 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
654 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
655 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
657 setOperationAction(ISD::FLOG, MVT::f80, Expand);
658 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
659 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
660 setOperationAction(ISD::FEXP, MVT::f80, Expand);
661 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
662 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
663 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
665 // Some FP actions are always expanded for vector types.
666 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
667 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
668 setOperationAction(ISD::FSIN, VT, Expand);
669 setOperationAction(ISD::FSINCOS, VT, Expand);
670 setOperationAction(ISD::FCOS, VT, Expand);
671 setOperationAction(ISD::FREM, VT, Expand);
672 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
673 setOperationAction(ISD::FPOW, VT, Expand);
674 setOperationAction(ISD::FLOG, VT, Expand);
675 setOperationAction(ISD::FLOG2, VT, Expand);
676 setOperationAction(ISD::FLOG10, VT, Expand);
677 setOperationAction(ISD::FEXP, VT, Expand);
678 setOperationAction(ISD::FEXP2, VT, Expand);
681 // First set operation action for all vector types to either promote
682 // (for widening) or expand (for scalarization). Then we will selectively
683 // turn on ones that can be effectively codegen'd.
684 for (MVT VT : MVT::vector_valuetypes()) {
685 setOperationAction(ISD::SDIV, VT, Expand);
686 setOperationAction(ISD::UDIV, VT, Expand);
687 setOperationAction(ISD::SREM, VT, Expand);
688 setOperationAction(ISD::UREM, VT, Expand);
689 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
690 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
691 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
692 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
693 setOperationAction(ISD::FMA, VT, Expand);
694 setOperationAction(ISD::FFLOOR, VT, Expand);
695 setOperationAction(ISD::FCEIL, VT, Expand);
696 setOperationAction(ISD::FTRUNC, VT, Expand);
697 setOperationAction(ISD::FRINT, VT, Expand);
698 setOperationAction(ISD::FNEARBYINT, VT, Expand);
699 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
700 setOperationAction(ISD::MULHS, VT, Expand);
701 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
702 setOperationAction(ISD::MULHU, VT, Expand);
703 setOperationAction(ISD::SDIVREM, VT, Expand);
704 setOperationAction(ISD::UDIVREM, VT, Expand);
705 setOperationAction(ISD::CTPOP, VT, Expand);
706 setOperationAction(ISD::CTTZ, VT, Expand);
707 setOperationAction(ISD::CTLZ, VT, Expand);
708 setOperationAction(ISD::ROTL, VT, Expand);
709 setOperationAction(ISD::ROTR, VT, Expand);
710 setOperationAction(ISD::BSWAP, VT, Expand);
711 setOperationAction(ISD::SETCC, VT, Expand);
712 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
713 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
714 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
715 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
716 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
717 setOperationAction(ISD::TRUNCATE, VT, Expand);
718 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
719 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
720 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
721 setOperationAction(ISD::SELECT_CC, VT, Expand);
722 for (MVT InnerVT : MVT::vector_valuetypes()) {
723 setTruncStoreAction(InnerVT, VT, Expand);
725 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
726 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
728 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
729 // types, we have to deal with them whether we ask for Expansion or not.
730 // Setting Expand causes its own optimisation problems though, so leave
732 if (VT.getVectorElementType() == MVT::i1)
733 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
735 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
736 // split/scalarized right now.
737 if (VT.getVectorElementType() == MVT::f16)
738 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
742 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
743 // with -msoft-float, disable use of MMX as well.
744 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
745 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
746 // No operations on x86mmx supported, everything uses intrinsics.
749 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
750 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
751 : &X86::VR128RegClass);
753 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
754 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
755 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
756 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
757 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
758 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
759 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
760 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
761 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
764 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
765 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
766 : &X86::VR128RegClass);
768 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
769 // registers cannot be used even for integer operations.
770 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
771 : &X86::VR128RegClass);
772 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
773 : &X86::VR128RegClass);
774 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
775 : &X86::VR128RegClass);
776 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
777 : &X86::VR128RegClass);
779 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
780 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
781 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
782 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
783 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
784 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
785 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
786 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
787 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
788 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
789 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
790 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
791 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
793 setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
794 setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
795 setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
796 setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
798 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
799 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
800 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
802 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
803 setOperationAction(ISD::SETCC, VT, Custom);
804 setOperationAction(ISD::CTPOP, VT, Custom);
805 setOperationAction(ISD::CTTZ, VT, Custom);
808 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
809 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
810 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
811 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
812 setOperationAction(ISD::VSELECT, VT, Custom);
813 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
816 // We support custom legalizing of sext and anyext loads for specific
817 // memory vector types which we can load as a scalar (or sequence of
818 // scalars) and extend in-register to a legal 128-bit vector type. For sext
819 // loads these must work with a single scalar load.
820 for (MVT VT : MVT::integer_vector_valuetypes()) {
821 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
822 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
823 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
824 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
825 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
826 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
827 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
828 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
829 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
832 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
833 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
834 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
835 setOperationAction(ISD::VSELECT, VT, Custom);
837 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
840 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
841 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
844 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
845 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
846 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
847 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
848 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
849 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
850 setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
853 // Custom lower v2i64 and v2f64 selects.
854 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
855 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
857 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
858 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
860 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
861 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
863 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
865 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
866 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
868 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
869 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
871 for (MVT VT : MVT::fp_vector_valuetypes())
872 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
874 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
875 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
876 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
878 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
879 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
880 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
882 // In the customized shift lowering, the legal v4i32/v2i64 cases
883 // in AVX2 will be recognized.
884 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
885 setOperationAction(ISD::SRL, VT, Custom);
886 setOperationAction(ISD::SHL, VT, Custom);
887 setOperationAction(ISD::SRA, VT, Custom);
891 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
892 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
893 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
894 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
895 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
896 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
897 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
898 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
899 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
902 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
903 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
904 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
905 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
906 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
907 setOperationAction(ISD::FRINT, RoundedTy, Legal);
908 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
911 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
912 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
913 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
914 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
915 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
916 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
917 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
918 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
920 // FIXME: Do we need to handle scalar-to-vector here?
921 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
923 // We directly match byte blends in the backend as they match the VSELECT
925 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
927 // SSE41 brings specific instructions for doing vector sign extend even in
928 // cases where we don't have SRA.
929 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
930 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
931 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
934 for (MVT VT : MVT::integer_vector_valuetypes()) {
935 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
936 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
937 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
940 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
941 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
942 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
943 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
944 setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
945 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
946 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
947 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
948 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
951 // i8 vectors are custom because the source register and
952 // source memory operand types are not the same width.
953 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
956 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
957 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
958 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
959 setOperationAction(ISD::ROTL, VT, Custom);
961 // XOP can efficiently perform BITREVERSE with VPPERM.
962 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
963 setOperationAction(ISD::BITREVERSE, VT, Custom);
965 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
966 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
967 setOperationAction(ISD::BITREVERSE, VT, Custom);
970 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
971 bool HasInt256 = Subtarget.hasInt256();
973 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
974 : &X86::VR256RegClass);
975 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
976 : &X86::VR256RegClass);
977 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
978 : &X86::VR256RegClass);
979 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
980 : &X86::VR256RegClass);
981 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
982 : &X86::VR256RegClass);
983 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
984 : &X86::VR256RegClass);
986 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
987 setOperationAction(ISD::FFLOOR, VT, Legal);
988 setOperationAction(ISD::FCEIL, VT, Legal);
989 setOperationAction(ISD::FTRUNC, VT, Legal);
990 setOperationAction(ISD::FRINT, VT, Legal);
991 setOperationAction(ISD::FNEARBYINT, VT, Legal);
992 setOperationAction(ISD::FNEG, VT, Custom);
993 setOperationAction(ISD::FABS, VT, Custom);
994 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
997 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
998 // even though v8i16 is a legal type.
999 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1000 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1001 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1003 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1004 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1006 for (MVT VT : MVT::fp_vector_valuetypes())
1007 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1009 // In the customized shift lowering, the legal v8i32/v4i64 cases
1010 // in AVX2 will be recognized.
1011 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1012 setOperationAction(ISD::SRL, VT, Custom);
1013 setOperationAction(ISD::SHL, VT, Custom);
1014 setOperationAction(ISD::SRA, VT, Custom);
1017 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1018 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1019 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1021 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1022 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1023 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1024 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1027 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1028 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1029 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1030 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1032 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1033 setOperationAction(ISD::SETCC, VT, Custom);
1034 setOperationAction(ISD::CTPOP, VT, Custom);
1035 setOperationAction(ISD::CTTZ, VT, Custom);
1036 setOperationAction(ISD::CTLZ, VT, Custom);
1039 if (Subtarget.hasAnyFMA()) {
1040 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1041 MVT::v2f64, MVT::v4f64 })
1042 setOperationAction(ISD::FMA, VT, Legal);
1045 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1046 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1047 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1050 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1051 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1052 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1053 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1055 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1056 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1058 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1059 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1060 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1061 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1063 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1064 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1065 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1066 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1067 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1068 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1072 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1073 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1074 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1076 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1077 // when we have a 256bit-wide blend with immediate.
1078 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1080 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1081 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1082 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1083 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1084 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1085 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1086 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1087 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1091 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1092 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1093 setOperationAction(ISD::MLOAD, VT, Legal);
1094 setOperationAction(ISD::MSTORE, VT, Legal);
1097 // Extract subvector is special because the value type
1098 // (result) is 128-bit but the source is 256-bit wide.
1099 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1100 MVT::v4f32, MVT::v2f64 }) {
1101 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1104 // Custom lower several nodes for 256-bit types.
1105 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1106 MVT::v8f32, MVT::v4f64 }) {
1107 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1108 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1109 setOperationAction(ISD::VSELECT, VT, Custom);
1110 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1111 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1112 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1113 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1114 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1118 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1120 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1121 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1122 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1123 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1124 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1125 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
1126 setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1130 // Custom legalize 2x32 to get a little better code.
1131 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1132 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1134 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1135 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1136 setOperationAction(ISD::MGATHER, VT, Custom);
1140 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1141 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1142 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1143 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1144 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1146 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1147 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1148 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1150 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1151 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1152 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1154 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i1, MVT::v16i32);
1155 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i1, MVT::v16i32);
1156 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i1, MVT::v8i32);
1157 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i1, MVT::v8i32);
1158 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i1, MVT::v4i32);
1159 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i1, MVT::v4i32);
1160 setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
1161 setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
1163 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
1164 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
1165 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1166 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1167 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1168 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1169 if (Subtarget.hasVLX()) {
1170 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1171 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1174 // Extends of v16i1/v8i1 to 128-bit vectors.
1175 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1176 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
1177 setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom);
1178 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1179 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
1180 setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom);
1182 for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1183 setOperationAction(ISD::ADD, VT, Custom);
1184 setOperationAction(ISD::SUB, VT, Custom);
1185 setOperationAction(ISD::MUL, VT, Custom);
1186 setOperationAction(ISD::SETCC, VT, Custom);
1187 setOperationAction(ISD::SELECT, VT, Custom);
1188 setOperationAction(ISD::TRUNCATE, VT, Custom);
1190 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1191 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1192 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1193 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1194 setOperationAction(ISD::VSELECT, VT, Expand);
1197 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1198 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1199 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1200 for (auto VT : { MVT::v1i1, MVT::v8i1 })
1201 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1203 for (MVT VT : MVT::fp_vector_valuetypes())
1204 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1206 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1207 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1208 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1209 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1210 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1211 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1214 for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1215 MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1216 MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1217 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1218 setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1219 setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1220 setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1221 setTruncStoreAction(VT, MaskVT, Custom);
1224 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1225 setOperationAction(ISD::FNEG, VT, Custom);
1226 setOperationAction(ISD::FABS, VT, Custom);
1227 setOperationAction(ISD::FMA, VT, Legal);
1228 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1231 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1232 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
1233 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
1234 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1235 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
1236 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
1237 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1238 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1240 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1241 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1242 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1243 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1244 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1246 if (!Subtarget.hasVLX()) {
1247 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1248 // to 512-bit rather than use the AVX2 instructions so that we can use
1250 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1251 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1252 setOperationAction(ISD::MLOAD, VT, Custom);
1253 setOperationAction(ISD::MSTORE, VT, Custom);
1257 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1258 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1259 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1260 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1261 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1262 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1263 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1264 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1266 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1267 setOperationAction(ISD::FFLOOR, VT, Legal);
1268 setOperationAction(ISD::FCEIL, VT, Legal);
1269 setOperationAction(ISD::FTRUNC, VT, Legal);
1270 setOperationAction(ISD::FRINT, VT, Legal);
1271 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1274 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1275 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1277 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1278 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1279 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1281 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1282 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1283 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1284 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1286 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1287 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1289 setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
1290 setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
1292 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1293 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1294 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1296 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1297 setOperationAction(ISD::SMAX, VT, Legal);
1298 setOperationAction(ISD::UMAX, VT, Legal);
1299 setOperationAction(ISD::SMIN, VT, Legal);
1300 setOperationAction(ISD::UMIN, VT, Legal);
1301 setOperationAction(ISD::ABS, VT, Legal);
1302 setOperationAction(ISD::SRL, VT, Custom);
1303 setOperationAction(ISD::SHL, VT, Custom);
1304 setOperationAction(ISD::SRA, VT, Custom);
1305 setOperationAction(ISD::CTPOP, VT, Custom);
1306 setOperationAction(ISD::CTTZ, VT, Custom);
1307 setOperationAction(ISD::ROTL, VT, Custom);
1308 setOperationAction(ISD::ROTR, VT, Custom);
1311 // Need to promote to 64-bit even though we have 32-bit masked instructions
1312 // because the IR optimizers rearrange bitcasts around logic ops leaving
1313 // too many variations to handle if we don't promote them.
1314 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1315 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1316 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1318 if (Subtarget.hasDQI()) {
1319 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1320 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1321 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1322 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1325 if (Subtarget.hasCDI()) {
1326 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1327 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1328 setOperationAction(ISD::CTLZ, VT, Legal);
1329 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1331 } // Subtarget.hasCDI()
1333 if (Subtarget.hasVPOPCNTDQ()) {
1334 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1335 setOperationAction(ISD::CTPOP, VT, Legal);
1338 // Extract subvector is special because the value type
1339 // (result) is 256-bit but the source is 512-bit wide.
1340 // 128-bit was made Legal under AVX1.
1341 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1342 MVT::v8f32, MVT::v4f64 })
1343 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1345 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1346 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1347 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1348 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1349 setOperationAction(ISD::VSELECT, VT, Custom);
1350 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1351 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1352 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1353 setOperationAction(ISD::MLOAD, VT, Legal);
1354 setOperationAction(ISD::MSTORE, VT, Legal);
1355 setOperationAction(ISD::MGATHER, VT, Custom);
1356 setOperationAction(ISD::MSCATTER, VT, Custom);
1358 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1359 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1360 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1364 if (!Subtarget.useSoftFloat() &&
1365 (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
1366 // These operations are handled on non-VLX by artificially widening in
1368 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1370 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1371 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1372 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1373 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1374 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1376 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1377 setOperationAction(ISD::SMAX, VT, Legal);
1378 setOperationAction(ISD::UMAX, VT, Legal);
1379 setOperationAction(ISD::SMIN, VT, Legal);
1380 setOperationAction(ISD::UMIN, VT, Legal);
1381 setOperationAction(ISD::ABS, VT, Legal);
1384 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1385 setOperationAction(ISD::ROTL, VT, Custom);
1386 setOperationAction(ISD::ROTR, VT, Custom);
1389 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1390 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1391 setOperationAction(ISD::MSCATTER, VT, Custom);
1393 if (Subtarget.hasDQI()) {
1394 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1395 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1396 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1397 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1398 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1402 if (Subtarget.hasCDI()) {
1403 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1404 setOperationAction(ISD::CTLZ, VT, Legal);
1405 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1407 } // Subtarget.hasCDI()
1409 if (Subtarget.hasVPOPCNTDQ()) {
1410 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1411 setOperationAction(ISD::CTPOP, VT, Legal);
1415 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1416 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1417 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1419 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1420 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1422 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1423 setOperationAction(ISD::ADD, VT, Custom);
1424 setOperationAction(ISD::SUB, VT, Custom);
1425 setOperationAction(ISD::MUL, VT, Custom);
1426 setOperationAction(ISD::VSELECT, VT, Expand);
1428 setOperationAction(ISD::TRUNCATE, VT, Custom);
1429 setOperationAction(ISD::SETCC, VT, Custom);
1430 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1431 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1432 setOperationAction(ISD::SELECT, VT, Custom);
1433 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1434 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1437 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1438 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1439 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1440 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1441 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1442 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1444 // Extends from v32i1 masks to 256-bit vectors.
1445 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1446 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1447 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1448 // Extends from v64i1 masks to 512-bit vectors.
1449 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1450 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1451 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1453 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1454 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1455 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1456 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1457 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1458 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1459 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1460 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1461 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1462 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1463 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1464 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1465 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1466 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1467 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1468 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1469 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1470 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1471 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1472 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1473 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1474 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1475 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1477 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1479 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1481 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1482 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1483 setOperationAction(ISD::VSELECT, VT, Custom);
1484 setOperationAction(ISD::ABS, VT, Legal);
1485 setOperationAction(ISD::SRL, VT, Custom);
1486 setOperationAction(ISD::SHL, VT, Custom);
1487 setOperationAction(ISD::SRA, VT, Custom);
1488 setOperationAction(ISD::MLOAD, VT, Legal);
1489 setOperationAction(ISD::MSTORE, VT, Legal);
1490 setOperationAction(ISD::CTPOP, VT, Custom);
1491 setOperationAction(ISD::CTTZ, VT, Custom);
1492 setOperationAction(ISD::CTLZ, VT, Custom);
1493 setOperationAction(ISD::SMAX, VT, Legal);
1494 setOperationAction(ISD::UMAX, VT, Legal);
1495 setOperationAction(ISD::SMIN, VT, Legal);
1496 setOperationAction(ISD::UMIN, VT, Legal);
1498 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1499 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1500 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1503 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1504 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1507 if (Subtarget.hasBITALG()) {
1508 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1509 setOperationAction(ISD::CTPOP, VT, Legal);
1513 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
1514 (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
1515 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1516 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1517 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1520 // These operations are handled on non-VLX by artificially widening in
1522 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1524 if (Subtarget.hasBITALG()) {
1525 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1526 setOperationAction(ISD::CTPOP, VT, Legal);
1530 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1531 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1532 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1534 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1535 setOperationAction(ISD::ADD, VT, Custom);
1536 setOperationAction(ISD::SUB, VT, Custom);
1537 setOperationAction(ISD::MUL, VT, Custom);
1538 setOperationAction(ISD::VSELECT, VT, Expand);
1540 setOperationAction(ISD::TRUNCATE, VT, Custom);
1541 setOperationAction(ISD::SETCC, VT, Custom);
1542 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1543 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1544 setOperationAction(ISD::SELECT, VT, Custom);
1545 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1546 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1549 // TODO: v8i1 concat should be legal without VLX to support concats of
1550 // v1i1, but we won't legalize it correctly currently without introducing
1551 // a v4i1 concat in the middle.
1552 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1553 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1554 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1555 for (auto VT : { MVT::v2i1, MVT::v4i1 })
1556 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1558 // Extends from v2i1/v4i1 masks to 128-bit vectors.
1559 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1560 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
1561 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
1562 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
1563 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom);
1564 setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom);
1566 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1567 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1568 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1569 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1570 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1572 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1573 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1574 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1575 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1576 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1578 if (Subtarget.hasDQI()) {
1579 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1580 // v2f32 UINT_TO_FP is already custom under SSE2.
1581 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1582 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1583 "Unexpected operation action!");
1584 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1585 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1586 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1589 if (Subtarget.hasBWI()) {
1590 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1591 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1595 // We want to custom lower some of our intrinsics.
1596 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1597 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1598 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1599 if (!Subtarget.is64Bit()) {
1600 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1601 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1604 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1605 // handle type legalization for these operations here.
1607 // FIXME: We really should do custom legalization for addition and
1608 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1609 // than generic legalization for 64-bit multiplication-with-overflow, though.
1610 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1611 if (VT == MVT::i64 && !Subtarget.is64Bit())
1613 // Add/Sub/Mul with overflow operations are custom lowered.
1614 setOperationAction(ISD::SADDO, VT, Custom);
1615 setOperationAction(ISD::UADDO, VT, Custom);
1616 setOperationAction(ISD::SSUBO, VT, Custom);
1617 setOperationAction(ISD::USUBO, VT, Custom);
1618 setOperationAction(ISD::SMULO, VT, Custom);
1619 setOperationAction(ISD::UMULO, VT, Custom);
1621 // Support carry in as value rather than glue.
1622 setOperationAction(ISD::ADDCARRY, VT, Custom);
1623 setOperationAction(ISD::SUBCARRY, VT, Custom);
1624 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1627 if (!Subtarget.is64Bit()) {
1628 // These libcalls are not available in 32-bit.
1629 setLibcallName(RTLIB::SHL_I128, nullptr);
1630 setLibcallName(RTLIB::SRL_I128, nullptr);
1631 setLibcallName(RTLIB::SRA_I128, nullptr);
1632 setLibcallName(RTLIB::MUL_I128, nullptr);
1635 // Combine sin / cos into _sincos_stret if it is available.
1636 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1637 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1638 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1639 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1642 if (Subtarget.isTargetWin64()) {
1643 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1644 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1645 setOperationAction(ISD::SREM, MVT::i128, Custom);
1646 setOperationAction(ISD::UREM, MVT::i128, Custom);
1647 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1648 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1651 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1652 // is. We should promote the value to 64-bits to solve this.
1653 // This is what the CRT headers do - `fmodf` is an inline header
1654 // function casting to f64 and calling `fmod`.
1655 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1656 Subtarget.isTargetWindowsItanium()))
1657 for (ISD::NodeType Op :
1658 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1659 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1660 if (isOperationExpand(Op, MVT::f32))
1661 setOperationAction(Op, MVT::f32, Promote);
1663 // We have target-specific dag combine patterns for the following nodes:
1664 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1665 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1666 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1667 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1668 setTargetDAGCombine(ISD::BITCAST);
1669 setTargetDAGCombine(ISD::VSELECT);
1670 setTargetDAGCombine(ISD::SELECT);
1671 setTargetDAGCombine(ISD::SHL);
1672 setTargetDAGCombine(ISD::SRA);
1673 setTargetDAGCombine(ISD::SRL);
1674 setTargetDAGCombine(ISD::OR);
1675 setTargetDAGCombine(ISD::AND);
1676 setTargetDAGCombine(ISD::ADD);
1677 setTargetDAGCombine(ISD::FADD);
1678 setTargetDAGCombine(ISD::FSUB);
1679 setTargetDAGCombine(ISD::FNEG);
1680 setTargetDAGCombine(ISD::FMA);
1681 setTargetDAGCombine(ISD::FMINNUM);
1682 setTargetDAGCombine(ISD::FMAXNUM);
1683 setTargetDAGCombine(ISD::SUB);
1684 setTargetDAGCombine(ISD::LOAD);
1685 setTargetDAGCombine(ISD::MLOAD);
1686 setTargetDAGCombine(ISD::STORE);
1687 setTargetDAGCombine(ISD::MSTORE);
1688 setTargetDAGCombine(ISD::TRUNCATE);
1689 setTargetDAGCombine(ISD::ZERO_EXTEND);
1690 setTargetDAGCombine(ISD::ANY_EXTEND);
1691 setTargetDAGCombine(ISD::SIGN_EXTEND);
1692 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1693 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1694 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1695 setTargetDAGCombine(ISD::SINT_TO_FP);
1696 setTargetDAGCombine(ISD::UINT_TO_FP);
1697 setTargetDAGCombine(ISD::SETCC);
1698 setTargetDAGCombine(ISD::MUL);
1699 setTargetDAGCombine(ISD::XOR);
1700 setTargetDAGCombine(ISD::MSCATTER);
1701 setTargetDAGCombine(ISD::MGATHER);
1703 computeRegisterProperties(Subtarget.getRegisterInfo());
1705 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1706 MaxStoresPerMemsetOptSize = 8;
1707 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1708 MaxStoresPerMemcpyOptSize = 4;
1709 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1710 MaxStoresPerMemmoveOptSize = 4;
1712 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1713 // that needs to benchmarked and balanced with the potential use of vector
1714 // load/store types (PR33329, PR33914).
1715 MaxLoadsPerMemcmp = 2;
1716 MaxLoadsPerMemcmpOptSize = 2;
1718 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1719 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1721 // An out-of-order CPU can speculatively execute past a predictable branch,
1722 // but a conditional move could be stalled by an expensive earlier operation.
1723 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1724 EnableExtLdPromotion = true;
1725 setPrefFunctionAlignment(4); // 2^4 bytes.
1727 verifyIntrinsicTables();
// Whether SelectionDAG should emit the LOAD_STACK_GUARD pseudo to load the
// stack-protector guard value.
1730 // This has so far only been implemented for 64-bit MachO.
1731 bool X86TargetLowering::useLoadStackGuardNode() const {
// Restricted to x86-64 Mach-O (Darwin) targets.
1732 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
// Whether the stack-guard value should be XORed with the frame pointer
// before being stored/compared.
1735 bool X86TargetLowering::useStackGuardXorFP() const {
1736 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1737 return Subtarget.getTargetTriple().isOSMSVCRT();
// Build the machine node that XORs the stack-guard value Val with the frame
// pointer (X86::XOR64_FP on 64-bit targets, X86::XOR32_FP on 32-bit).
1740 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1741 const SDLoc &DL) const {
1742 EVT PtrTy = getPointerTy(DAG.getDataLayout());
// Pick the pointer-width XOR-with-FP pseudo instruction.
1743 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1744 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
// Wrap result 0 of the machine node back into an SDValue.
1745 return SDValue(Node, 0);
// Preferred legalization strategy for illegal vector types: widen to the
// next legal vector when the experimental widening flag is set, except for
// one-element vectors and vXi1 mask vectors.
1748 TargetLoweringBase::LegalizeTypeAction
1749 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1750 if (ExperimentalVectorWideningLegalization &&
1751 VT.getVectorNumElements() != 1 &&
1752 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1753 return TypeWidenVector;
// Otherwise defer to the generic heuristic.
1755 return TargetLoweringBase::getPreferredVectorAction(VT);
// Result type of a SETCC. With AVX-512 a vector compare can produce a vXi1
// mask type; otherwise an integer vector of the same shape is used.
// NOTE(review): this excerpt elides several original lines (the VT
// parameter, the scalar early-out, and the initialization of LegalVT);
// comments below describe only the visible code.
1758 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1759 LLVMContext& Context,
1764 if (Subtarget.hasAVX512()) {
1765 const unsigned NumElts = VT.getVectorNumElements();
1767 // Figure out what this type will be legalized to.
// Iterate type legalization until a legal type is reached.
1769 while (getTypeAction(Context, LegalVT) != TypeLegal)
1770 LegalVT = getTypeToTransformTo(Context, LegalVT);
1772 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1773 if (LegalVT.getSimpleVT().is512BitVector())
1774 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1776 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1777 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1778 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
1780 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1781 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1782 return EVT::getVectorVT(Context, MVT::i1, NumElts);
// Fallback: same vector shape with integer elements (e.g. v4f32 -> v4i32).
1786 return VT.changeVectorElementTypeToInteger();
1789 /// Helper for getByValTypeAlignment to determine
1790 /// the desired ByVal argument alignment.
// Recursively scans Ty and raises MaxAlign when a 128-bit (SSE-sized)
// vector is found anywhere inside the aggregate.
// NOTE(review): lines are elided in this excerpt — the early-out and the
// assignment done for the 128-bit vector case are not visible.
1791 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1794 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1795 if (VTy->getBitWidth() == 128)
1797 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
// Arrays: recurse into the element type.
1798 unsigned EltAlign = 0;
1799 getMaxByValAlign(ATy->getElementType(), EltAlign);
1800 if (EltAlign > MaxAlign)
1801 MaxAlign = EltAlign;
1802 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
// Structs: recurse into every field and keep the maximum.
1803 for (auto *EltTy : STy->elements()) {
1804 unsigned EltAlign = 0;
1805 getMaxByValAlign(EltTy, EltAlign);
1806 if (EltAlign > MaxAlign)
1807 MaxAlign = EltAlign;
1814 /// Return the desired alignment for ByVal aggregate
1815 /// function arguments in the caller parameter area. For X86, aggregates
1816 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1817 /// are at 4-byte boundaries.
// NOTE(review): the return statements and the declaration of `Align` are in
// lines elided from this excerpt; only the branch structure is visible.
1818 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1819 const DataLayout &DL) const {
// 64-bit: based on the type's ABI alignment.
1820 if (Subtarget.is64Bit()) {
1821 // Max of 8 and alignment of type.
1822 unsigned TyAlign = DL.getABITypeAlignment(Ty);
// 32-bit: scan for SSE vectors only when SSE1 is available.
1829 if (Subtarget.hasSSE1())
1830 getMaxByValAlign(Ty, Align);
1834 /// Returns the target specific optimal type for load
1835 /// and store operations as a result of memset, memcpy, and memmove
1836 /// lowering. If DstAlign is zero that means it's safe to destination
1837 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1838 /// means there isn't a need to check it against alignment requirement,
1839 /// probably because the source does not need to be loaded. If 'IsMemset' is
1840 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1841 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1842 /// source is constant so it does not need to be loaded.
1843 /// It returns EVT::Other if the type should be determined using generic
1844 /// target-independent logic.
// NOTE(review): this excerpt elides several lines, including the return
// type, the MemcpyStrSrc parameter, and the `return MVT::...` statement of
// each branch below; comments describe only the visible structure.
1846 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1847 unsigned DstAlign, unsigned SrcAlign,
1848 bool IsMemset, bool ZeroMemset,
1850 MachineFunction &MF) const {
1851 const Function &F = MF.getFunction();
// Vector types are only considered when the function does not forbid
// implicit FP/vector usage.
1852 if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
// Require either fast unaligned 16-byte accesses or 16-byte alignment on
// both sides (0 = unconstrained).
1854 (!Subtarget.isUnalignedMem16Slow() ||
1855 ((DstAlign == 0 || DstAlign >= 16) &&
1856 (SrcAlign == 0 || SrcAlign >= 16)))) {
1857 // FIXME: Check if unaligned 32-byte accesses are slow.
1858 if (Size >= 32 && Subtarget.hasAVX()) {
1859 // Although this isn't a well-supported type for AVX1, we'll let
1860 // legalization and shuffle lowering produce the optimal codegen. If we
1861 // choose an optimal type with a vector element larger than a byte,
1862 // getMemsetStores() may create an intermediate splat (using an integer
1863 // multiply) before we splat as a vector.
1866 if (Subtarget.hasSSE2())
1868 // TODO: Can SSE1 handle a byte vector?
1869 if (Subtarget.hasSSE1())
// 32-bit SSE2: an 8-byte FP type can cover the copy/zero-memset case.
1871 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1872 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1873 // Do not use f64 to lower memcpy if source is string constant. It's
1874 // better to use i32 to avoid the loads.
1875 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1876 // The gymnastics of splatting a byte value into an XMM register and then
1877 // only using 8-byte stores (because this is a CPU with slow unaligned
1878 // 16-byte accesses) makes that a loser.
1882 // This is a compromise. If we reach here, unaligned accesses may be slow on
1883 // this target. However, creating smaller, aligned accesses could be even
1884 // slower and would certainly be a lot more code.
1885 if (Subtarget.is64Bit() && Size >= 8)
// Whether VT is safe to use for a memory-op expansion: scalar FP types are
// only safe when handled in SSE registers (X86ScalarSSEf32/f64 flags).
// NOTE(review): the f32 condition line is elided from this excerpt.
1890 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1892 return X86ScalarSSEf32;
1893 else if (VT == MVT::f64)
1894 return X86ScalarSSEf64;
// Report whether misaligned accesses of VT are allowed and, via *Fast,
// whether they are fast on this subtarget.
// NOTE(review): the remaining parameters and several switch cases/returns
// are elided from this excerpt.
1899 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// Dispatch on the access size in bits.
1904 switch (VT.getSizeInBits()) {
1906 // 8-byte and under are always assumed to be fast.
// 128-bit: fast unless the CPU has slow unaligned 16-byte accesses.
1910 *Fast = !Subtarget.isUnalignedMem16Slow();
// 256-bit: fast unless the CPU has slow unaligned 32-byte accesses.
1913 *Fast = !Subtarget.isUnalignedMem32Slow();
1915 // TODO: What about AVX-512 (512-bit) accesses?
1918 // Misaligned accesses of any size are always allowed.
1922 /// Return the entry encoding for a jump table in the
1923 /// current function. The returned value is a member of the
1924 /// MachineJumpTableInfo::JTEntryKind enum.
1925 unsigned X86TargetLowering::getJumpTableEncoding() const {
1926 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// GOT-style PIC needs the custom 32-bit entry kind, emitted by
// LowerCustomJumpTableEntry below.
1928 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1929 return MachineJumpTableInfo::EK_Custom32;
1931 // Otherwise, use the normal jump table encoding heuristics.
1932 return TargetLowering::getJumpTableEncoding();
// Forward the soft-float query to the subtarget.
1935 bool X86TargetLowering::useSoftFloat() const {
1936 return Subtarget.useSoftFloat();
// Mark leading integer/pointer libcall arguments as in-register when the
// module requests register parameters (x86-32 C / stdcall only).
// NOTE(review): several lines are elided from this excerpt (the early
// `return`s, the two-register assignment, and loop-exit handling).
1939 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1940 ArgListTy &Args) const {
1942 // Only relabel X86-32 for C / Stdcall CCs.
1943 if (Subtarget.is64Bit())
1945 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
// Number of register parameters is a module-level setting.
1947 unsigned ParamRegs = 0;
1948 if (auto *M = MF->getFunction().getParent())
1949 ParamRegs = M->getNumberRegisterParameters();
1951 // Mark the first N int arguments as having reg
1952 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1953 Type *T = Args[Idx].Ty;
// Only integer-like arguments of at most 8 bytes qualify.
1954 if (T->isPointerTy() || T->isIntegerTy())
1955 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1956 unsigned numRegs = 1;
// Arguments wider than 4 bytes consume two registers.
1957 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1959 if (ParamRegs < numRegs)
1961 ParamRegs -= numRegs;
1962 Args[Idx].IsInReg = true;
// Emit a custom (EK_Custom32) jump-table entry: a @GOTOFF reference to the
// basic block's symbol, used for GOT-style PIC.
// NOTE(review): the return-type line is elided from this excerpt.
1968 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1969 const MachineBasicBlock *MBB,
1970 unsigned uid,MCContext &Ctx) const{
// Only reachable in GOT-PIC mode (see getJumpTableEncoding).
1971 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1972 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1974 return MCSymbolRefExpr::create(MBB->getSymbol(),
1975 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1978 /// Returns relocation base for the given PIC jumptable.
// NOTE(review): the 64-bit fallthrough return is elided from this excerpt.
1979 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1980 SelectionDAG &DAG) const {
// 32-bit PIC: entries are relative to the global base register.
1981 if (!Subtarget.is64Bit())
1982 // This doesn't have SDLoc associated with it, but is not really the
1983 // same as a Register.
1984 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1985 getPointerTy(DAG.getDataLayout()));
1989 /// This returns the relocation base for the given PIC jumptable,
1990 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1991 const MCExpr *X86TargetLowering::
1992 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1993 MCContext &Ctx) const {
1994 // X86-64 uses RIP relative addressing based on the jump table label.
// RIP-relative PIC defers to the generic implementation.
1995 if (Subtarget.isPICStyleRIPRel())
1996 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1998 // Otherwise, the reference is relative to the PIC base.
1999 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2002 std::pair<const TargetRegisterClass *, uint8_t>
// Pick a representative register class (and copy cost) for VT, used by
// register-pressure heuristics.
// NOTE(review): elided lines include the VT parameter, the declaration of
// `Cost`, the `default:`/`break;` labels, and at least one `case` label.
2003 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2005 const TargetRegisterClass *RRC = nullptr;
2007 switch (VT.SimpleTy) {
// Unhandled types defer to the generic implementation.
2009 return TargetLowering::findRepresentativeClass(TRI, VT);
// Scalar integers: 64-bit GPRs on x86-64, 32-bit GPRs otherwise.
2010 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2011 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2014 RRC = &X86::VR64RegClass;
// Scalar FP and all 128/256/512-bit vector types share one class.
2016 case MVT::f32: case MVT::f64:
2017 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2018 case MVT::v4f32: case MVT::v2f64:
2019 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2020 case MVT::v8f32: case MVT::v4f64:
2021 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2022 case MVT::v16f32: case MVT::v8f64:
2023 RRC = &X86::VR128XRegClass;
2026 return std::make_pair(RRC, Cost);
// Address space used for segment-relative TLS accesses: on x86-64, 256 (GS)
// for the kernel code model, 257 (FS) otherwise.
// NOTE(review): the 32-bit return is elided from this excerpt.
2029 unsigned X86TargetLowering::getAddressSpace() const {
2030 if (Subtarget.is64Bit())
2031 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
// Whether the target's C library reserves a TLS slot for the stack guard
// (glibc, Fuchsia, and Android API level >= 17).
2035 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2036 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2037 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
// Build the constant expression (i8 addrspace(AddressSpace)*)Offset, i.e. a
// pointer at a fixed offset inside a segment-register address space.
2040 static Constant* SegmentOffset(IRBuilder<> &IRB,
2041 unsigned Offset, unsigned AddressSpace) {
2042 return ConstantExpr::getIntToPtr(
2043 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2044 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
// IR-level location of the stack guard: a segment-relative TLS pointer on
// targets that reserve a TLS slot, otherwise the default global.
2047 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2048 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2049 // tcbhead_t; use it instead of the usual global variable (see
2050 // sysdeps/{i386,x86_64}/nptl/tls.h)
2051 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2052 if (Subtarget.isTargetFuchsia()) {
2053 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2054 return SegmentOffset(IRB, 0x10, getAddressSpace());
2056 // %fs:0x28, unless we're using a Kernel code model, in which case
2057 // it's %gs:0x28. gs:0x14 on i386.
2058 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2059 return SegmentOffset(IRB, Offset, getAddressSpace());
// Default: the target-independent stack-guard location.
2063 return TargetLowering::getIRStackGuard(IRB);
// Declare the module-level symbols needed for stack-smashing protection.
// NOTE(review): some lines are elided here (control flow after the MSVC
// branch is not fully visible in this excerpt).
2066 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2067 // MSVC CRT provides functionalities for stack protection.
2068 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2069 // MSVC CRT has a global variable holding security cookie.
2070 M.getOrInsertGlobal("__security_cookie",
2071 Type::getInt8PtrTy(M.getContext()));
2073 // MSVC CRT has a function to validate security cookie.
2074 auto *SecurityCheckCookie = cast<Function>(
2075 M.getOrInsertFunction("__security_check_cookie",
2076 Type::getVoidTy(M.getContext()),
2077 Type::getInt8PtrTy(M.getContext())));
// The check function expects its argument in a register (fastcall + inreg).
2078 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2079 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2082 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2083 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2085 TargetLowering::insertSSPDeclarations(M);
// SelectionDAG stack-guard value: the MSVC security cookie when targeting
// an MSVC CRT, otherwise the default guard.
2088 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2089 // MSVC CRT has a global variable holding security cookie.
2090 if (Subtarget.getTargetTriple().isOSMSVCRT())
2091 return M.getGlobalVariable("__security_cookie");
2092 return TargetLowering::getSDagStackGuard(M);
// Stack-guard check function: the MSVC cookie validator when targeting an
// MSVC CRT, otherwise the default (typically none).
2095 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2096 // MSVC CRT has a function to validate security cookie.
2097 if (Subtarget.getTargetTriple().isOSMSVCRT())
2098 return M.getFunction("__security_check_cookie")
2099 return TargetLowering::getSSPStackGuardCheck(M);
// Location of the unsafe-stack pointer for SafeStack: fixed TLS slots on
// Android and Fuchsia, otherwise the default location.
2102 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
// Contiki: always use the non-TLS default location.
2103 if (Subtarget.getTargetTriple().isOSContiki())
2104 return getDefaultSafeStackPointerLocation(IRB, false);
2106 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2107 // definition of TLS_SLOT_SAFESTACK in
2108 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2109 if (Subtarget.isTargetAndroid()) {
2110 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2112 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2113 return SegmentOffset(IRB, Offset, getAddressSpace());
2116 // Fuchsia is similar.
2117 if (Subtarget.isTargetFuchsia()) {
2118 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2119 return SegmentOffset(IRB, 0x18, getAddressSpace());
// Fallback to the target-independent location.
2122 return TargetLowering::getSafeStackPointerLocation(IRB);
// Casts between "flat" x86 address spaces (< 256) are no-ops; casts that
// involve the segment-register address spaces (>= 256) are not.
2125 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2126 unsigned DestAS) const {
2127 assert(SrcAS != DestAS && "Expected different address spaces!");
2129 return SrcAS < 256 && DestAS < 256;
2132 //===----------------------------------------------------------------------===//
2133 // Return Value Calling Convention Implementation
2134 //===----------------------------------------------------------------------===//
2136 #include "X86GenCallingConv.inc"
// Check whether the return values in Outs can be lowered for this calling
// convention (i.e. whether an sret demotion is needed).
2138 bool X86TargetLowering::CanLowerReturn(
2139 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2140 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2141 SmallVector<CCValAssign, 16> RVLocs;
2142 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
// Run the x86 return-value convention over Outs and report feasibility.
2143 return CCInfo.CheckReturn(Outs, RetCC_X86);
// Scratch registers available around calls (zero-terminated list).
// NOTE(review): the return statement is elided from this excerpt.
2146 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2147 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2151 /// Lowers masks values (v*i1) to the local register values
2152 /// \returns DAG node after lowering to register type
// NOTE(review): at least one return statement of the two-stage branch is
// elided from this excerpt.
2153 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2154 const SDLoc &Dl, SelectionDAG &DAG) {
2155 EVT ValVT = ValArg.getValueType();
// v1i1: extract the single bit as a scalar of the location type.
2157 if (ValVT == MVT::v1i1)
2158 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2159 DAG.getIntPtrConstant(0, Dl));
2161 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2162 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2163 // Two stage lowering might be required
2164 // bitcast: v8i1 -> i8 / v16i1 -> i16
2165 // anyextend: i8 -> i32 / i16 -> i32
2166 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2167 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2168 if (ValLoc == MVT::i32)
2169 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2171 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2172 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2173 // One stage lowering is required
2174 // bitcast: v32i1 -> i32 / v64i1 -> i64
2175 return DAG.getBitcast(ValLoc, ValArg);
// Fallback: sign-extend the mask vector to the location type.
2177 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2180 /// Breaks v64i1 value into two registers and adds the new node to the DAG
// Splits a 64-bit value into two i32 halves on 32-bit AVX512BW targets and
// pushes them onto RegsToPass at the two consecutive locations VA/NextVA.
// NOTE(review): the declarations of Lo/Hi appear in a line elided from this
// excerpt.
2181 static void Passv64i1ArgInRegs(
2182 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2183 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2184 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2185 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2186 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2187 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2188 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2189 "The value should reside in two registers");
2191 // Before splitting the value we cast it to i64
2192 Arg = DAG.getBitcast(MVT::i64, Arg);
2194 // Splitting the value into two i32 types
// EXTRACT_ELEMENT 0 = low half, 1 = high half.
2196 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2197 DAG.getConstant(0, Dl, MVT::i32));
2198 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2199 DAG.getConstant(1, Dl, MVT::i32));
2201 // Attach the two i32 types into corresponding registers
2202 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2203 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2207 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2209 const SmallVectorImpl<ISD::OutputArg> &Outs,
2210 const SmallVectorImpl<SDValue> &OutVals,
2211 const SDLoc &dl, SelectionDAG &DAG) const {
2212 MachineFunction &MF = DAG.getMachineFunction();
2213 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2215 // In some cases we need to disable registers from the default CSR list.
2216 // For example, when they are used for argument passing.
2217 bool ShouldDisableCalleeSavedRegister =
2218 CallConv == CallingConv::X86_RegCall ||
2219 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2221 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2222 report_fatal_error("X86 interrupts may not return any value");
2224 SmallVector<CCValAssign, 16> RVLocs;
2225 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2226 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2229 SmallVector<SDValue, 6> RetOps;
2230 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2231 // Operand #1 = Bytes To Pop
2232 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2235 // Copy the result values into the output registers.
2236 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2238 CCValAssign &VA = RVLocs[I];
2239 assert(VA.isRegLoc() && "Can only return in registers!");
2241 // Add the register to the CalleeSaveDisableRegs list.
2242 if (ShouldDisableCalleeSavedRegister)
2243 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2245 SDValue ValToCopy = OutVals[OutsIndex];
2246 EVT ValVT = ValToCopy.getValueType();
2248 // Promote values to the appropriate types.
2249 if (VA.getLocInfo() == CCValAssign::SExt)
2250 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2251 else if (VA.getLocInfo() == CCValAssign::ZExt)
2252 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2253 else if (VA.getLocInfo() == CCValAssign::AExt) {
2254 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2255 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2257 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2259 else if (VA.getLocInfo() == CCValAssign::BCvt)
2260 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2262 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2263 "Unexpected FP-extend for return value.");
2265 // If this is x86-64, and we disabled SSE, we can't return FP values,
2266 // or SSE or MMX vectors.
2267 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2268 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2269 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2270 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2271 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2272 } else if (ValVT == MVT::f64 &&
2273 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2274 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2275 // llvm-gcc has never done it right and no one has noticed, so this
2276 // should be OK for now.
2277 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2278 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2281 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2282 // the RET instruction and handled by the FP Stackifier.
2283 if (VA.getLocReg() == X86::FP0 ||
2284 VA.getLocReg() == X86::FP1) {
2285 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2286 // change the value to the FP stack register class.
2287 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2288 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2289 RetOps.push_back(ValToCopy);
2290 // Don't emit a copytoreg.
2294 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2295 // which is returned in RAX / RDX.
2296 if (Subtarget.is64Bit()) {
2297 if (ValVT == MVT::x86mmx) {
2298 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2299 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2300 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2302 // If we don't have SSE2 available, convert to v4f32 so the generated
2303 // register is legal.
2304 if (!Subtarget.hasSSE2())
2305 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2310 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2312 if (VA.needsCustom()) {
2313 assert(VA.getValVT() == MVT::v64i1 &&
2314 "Currently the only custom case is when we split v64i1 to 2 regs");
2316 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2319 assert(2 == RegsToPass.size() &&
2320 "Expecting two registers after Pass64BitArgInRegs");
2322 // Add the second register to the CalleeSaveDisableRegs list.
2323 if (ShouldDisableCalleeSavedRegister)
2324 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2326 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2329 // Add nodes to the DAG and add the values into the RetOps list
2330 for (auto &Reg : RegsToPass) {
2331 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2332 Flag = Chain.getValue(1);
2333 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2337 // Swift calling convention does not require we copy the sret argument
2338 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2340 // All x86 ABIs require that for returning structs by value we copy
2341 // the sret argument into %rax/%eax (depending on ABI) for the return.
2342 // We saved the argument into a virtual register in the entry block,
2343 // so now we copy the value out and into %rax/%eax.
2345 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2346 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2347 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2348 // either case FuncInfo->setSRetReturnReg() will have been called.
2349 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2350 // When we have both sret and another return value, we should use the
2351 // original Chain stored in RetOps[0], instead of the current Chain updated
2352 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2354 // For the case of sret and another return value, we have
2355 // Chain_0 at the function entry
2356 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2357 // If we use Chain_1 in getCopyFromReg, we will have
2358 // Val = getCopyFromReg(Chain_1)
2359 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2361 // getCopyToReg(Chain_0) will be glued together with
2362 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2363 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2364 // Data dependency from Unit B to Unit A due to usage of Val in
2365 // getCopyToReg(Chain_1, Val)
2366 // Chain dependency from Unit A to Unit B
2368 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2369 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2370 getPointerTy(MF.getDataLayout()));
2373 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2374 X86::RAX : X86::EAX;
2375 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2376 Flag = Chain.getValue(1);
2378 // RAX/EAX now acts like a return value.
2380 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2382 // Add the returned register to the CalleeSaveDisableRegs list.
2383 if (ShouldDisableCalleeSavedRegister)
2384 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2387 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2388 const MCPhysReg *I =
2389 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2392 if (X86::GR64RegClass.contains(*I))
2393 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2395 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2399 RetOps[0] = Chain; // Update chain.
2401 // Add the flag if we have it.
2403 RetOps.push_back(Flag);
2405 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2406 if (CallConv == CallingConv::X86_INTR)
2407 opcode = X86ISD::IRET;
2408 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2411 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2412 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2415 SDValue TCChain = Chain;
2416 SDNode *Copy = *N->use_begin();
2417 if (Copy->getOpcode() == ISD::CopyToReg) {
2418 // If the copy has a glue operand, we conservatively assume it isn't safe to
2419 // perform a tail call.
2420 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2422 TCChain = Copy->getOperand(0);
2423 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2426 bool HasRet = false;
2427 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2429 if (UI->getOpcode() != X86ISD::RET_FLAG)
2431 // If we are returning more than one value, we can definitely
2432 // not make a tail call see PR19530
2433 if (UI->getNumOperands() > 4)
2435 if (UI->getNumOperands() == 4 &&
2436 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2448 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2449 ISD::NodeType ExtendKind) const {
2450 MVT ReturnMVT = MVT::i32;
2452 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2453 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2454 // The ABI does not require i1, i8 or i16 to be extended.
2456 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2457 // always extending i8/i16 return values, so keep doing that for now.
2459 ReturnMVT = MVT::i8;
2462 EVT MinVT = getRegisterType(Context, ReturnMVT);
2463 return VT.bitsLT(MinVT) ? MinVT : VT;
2466 /// Reads two 32 bit registers and creates a 64 bit mask value.
2467 /// \param VA The current 32 bit value that need to be assigned.
2468 /// \param NextVA The next 32 bit value that need to be assigned.
2469 /// \param Root The parent DAG node.
2470 /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
2471 /// glue purposes. In the case the DAG is already using
2472 /// physical register instead of virtual, we should glue
2473 /// our new SDValue to InFlag SDvalue.
2474 /// \return a new SDvalue of size 64bit.
2475 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2476 SDValue &Root, SelectionDAG &DAG,
2477 const SDLoc &Dl, const X86Subtarget &Subtarget,
2478 SDValue *InFlag = nullptr) {
2479 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2480 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2481 assert(VA.getValVT() == MVT::v64i1 &&
2482 "Expecting first location of 64 bit width type");
2483 assert(NextVA.getValVT() == VA.getValVT() &&
2484 "The locations should have the same type");
2485 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2486 "The values should reside in two registers");
2490 SDValue ArgValueLo, ArgValueHi;
2492 MachineFunction &MF = DAG.getMachineFunction();
2493 const TargetRegisterClass *RC = &X86::GR32RegClass;
2495 // Read a 32 bit value from the registers
2496 if (nullptr == InFlag) {
2497 // When no physical register is present,
2498 // create an intermediate virtual register
2499 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2500 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2501 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2502 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2504 // When a physical register is available read the value from it and glue
2505 // the reads together.
2507 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2508 *InFlag = ArgValueLo.getValue(2);
2510 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2511 *InFlag = ArgValueHi.getValue(2);
2514 // Convert the i32 type into v32i1 type
2515 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2517 // Convert the i32 type into v32i1 type
2518 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2520 // Concatenate the two values together
2521 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2524 /// The function will lower a register of various sizes (8/16/32/64)
2525 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2526 /// \returns a DAG node contains the operand after lowering to mask type.
2527 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2528 const EVT &ValLoc, const SDLoc &Dl,
2529 SelectionDAG &DAG) {
2530 SDValue ValReturned = ValArg;
2532 if (ValVT == MVT::v1i1)
2533 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2535 if (ValVT == MVT::v64i1) {
2536 // In 32 bit machine, this case is handled by getv64i1Argument
2537 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2538 // In 64 bit machine, There is no need to truncate the value only bitcast
2541 switch (ValVT.getSimpleVT().SimpleTy) {
2552 llvm_unreachable("Expecting a vector of i1 types");
2555 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2557 return DAG.getBitcast(ValVT, ValReturned);
2560 /// Lower the result values of a call into the
2561 /// appropriate copies out of appropriate physical registers.
2563 SDValue X86TargetLowering::LowerCallResult(
2564 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2565 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2566 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2567 uint32_t *RegMask) const {
2569 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2570 // Assign locations to each value returned by this call.
2571 SmallVector<CCValAssign, 16> RVLocs;
2572 bool Is64Bit = Subtarget.is64Bit();
2573 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2575 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2577 // Copy all of the result registers out of their specified physreg.
2578 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2580 CCValAssign &VA = RVLocs[I];
2581 EVT CopyVT = VA.getLocVT();
2583 // In some calling conventions we need to remove the used registers
2584 // from the register mask.
2586 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2587 SubRegs.isValid(); ++SubRegs)
2588 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2591 // If this is x86-64, and we disabled SSE, we can't return FP values
2592 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2593 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2594 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2595 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2598 // If we prefer to use the value in xmm registers, copy it out as f80 and
2599 // use a truncate to move it from fp stack reg to xmm reg.
2600 bool RoundAfterCopy = false;
2601 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2602 isScalarFPTypeInSSEReg(VA.getValVT())) {
2603 if (!Subtarget.hasX87())
2604 report_fatal_error("X87 register return with X87 disabled");
2606 RoundAfterCopy = (CopyVT != VA.getLocVT());
2610 if (VA.needsCustom()) {
2611 assert(VA.getValVT() == MVT::v64i1 &&
2612 "Currently the only custom case is when we split v64i1 to 2 regs");
2614 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2616 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2618 Val = Chain.getValue(0);
2619 InFlag = Chain.getValue(2);
2623 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2624 // This truncation won't change the value.
2625 DAG.getIntPtrConstant(1, dl));
2627 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2628 if (VA.getValVT().isVector() &&
2629 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2630 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2631 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2632 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2634 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2637 InVals.push_back(Val);
2643 //===----------------------------------------------------------------------===//
2644 // C & StdCall & Fast Calling Convention implementation
2645 //===----------------------------------------------------------------------===//
2646 // StdCall calling convention seems to be standard for many Windows' API
2647 // routines and around. It differs from C calling convention just a little:
2648 // callee should clean up the stack, not caller. Symbols should be also
2649 // decorated in some fancy way :) It doesn't support any vector arguments.
2650 // For info on fast calling convention see Fast Calling Convention (tail call)
2651 // implementation LowerX86_32FastCCCallTo.
/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
2655 enum StructReturnType {
2660 static StructReturnType
2661 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2663 return NotStructReturn;
2665 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2666 if (!Flags.isSRet())
2667 return NotStructReturn;
2668 if (Flags.isInReg() || IsMCU)
2669 return RegStructReturn;
2670 return StackStructReturn;
2673 /// Determines whether a function uses struct return semantics.
2674 static StructReturnType
2675 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2677 return NotStructReturn;
2679 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2680 if (!Flags.isSRet())
2681 return NotStructReturn;
2682 if (Flags.isInReg() || IsMCU)
2683 return RegStructReturn;
2684 return StackStructReturn;
2687 /// Make a copy of an aggregate at address specified by "Src" to address
2688 /// "Dst" with size and alignment information specified by the specific
2689 /// parameter attribute. The copy will be passed as a byval function parameter.
2690 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2691 SDValue Chain, ISD::ArgFlagsTy Flags,
2692 SelectionDAG &DAG, const SDLoc &dl) {
2693 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2695 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2696 /*isVolatile*/false, /*AlwaysInline=*/true,
2697 /*isTailCall*/false,
2698 MachinePointerInfo(), MachinePointerInfo());
2701 /// Return true if the calling convention is one that we can guarantee TCO for.
2702 static bool canGuaranteeTCO(CallingConv::ID CC) {
2703 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2704 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2705 CC == CallingConv::HHVM);
2708 /// Return true if we might ever do TCO for calls with this calling convention.
2709 static bool mayTailCallThisCC(CallingConv::ID CC) {
2711 // C calling conventions:
2712 case CallingConv::C:
2713 case CallingConv::Win64:
2714 case CallingConv::X86_64_SysV:
2715 // Callee pop conventions:
2716 case CallingConv::X86_ThisCall:
2717 case CallingConv::X86_StdCall:
2718 case CallingConv::X86_VectorCall:
2719 case CallingConv::X86_FastCall:
2722 return canGuaranteeTCO(CC);
2726 /// Return true if the function is being made into a tailcall target by
2727 /// changing its ABI.
2728 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2729 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2732 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2734 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2735 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2738 ImmutableCallSite CS(CI);
2739 CallingConv::ID CalleeCC = CS.getCallingConv();
2740 if (!mayTailCallThisCC(CalleeCC))
2747 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2748 const SmallVectorImpl<ISD::InputArg> &Ins,
2749 const SDLoc &dl, SelectionDAG &DAG,
2750 const CCValAssign &VA,
2751 MachineFrameInfo &MFI, unsigned i) const {
2752 // Create the nodes corresponding to a load from this parameter slot.
2753 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2754 bool AlwaysUseMutable = shouldGuaranteeTCO(
2755 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2756 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2758 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2760 // If value is passed by pointer we have address passed instead of the value
2761 // itself. No need to extend if the mask value and location share the same
2763 bool ExtendedInMem =
2764 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2765 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2767 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2768 ValVT = VA.getLocVT();
2770 ValVT = VA.getValVT();
2772 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2773 // taken by a return address.
2775 if (CallConv == CallingConv::X86_INTR) {
2776 // X86 interrupts may take one or two arguments.
2777 // On the stack there will be no return address as in regular call.
2778 // Offset of last argument need to be set to -4/-8 bytes.
2779 // Where offset of the first argument out of two, should be set to 0 bytes.
2780 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2781 if (Subtarget.is64Bit() && Ins.size() == 2) {
2782 // The stack pointer needs to be realigned for 64 bit handlers with error
2783 // code, so the argument offset changes by 8 bytes.
2788 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2789 // changed with more analysis.
2790 // In case of tail call optimization mark all arguments mutable. Since they
2791 // could be overwritten by lowering of arguments in case of a tail call.
2792 if (Flags.isByVal()) {
2793 unsigned Bytes = Flags.getByValSize();
2794 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2795 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2796 // Adjust SP offset of interrupt parameter.
2797 if (CallConv == CallingConv::X86_INTR) {
2798 MFI.setObjectOffset(FI, Offset);
2800 return DAG.getFrameIndex(FI, PtrVT);
2803 // This is an argument in memory. We might be able to perform copy elision.
2804 if (Flags.isCopyElisionCandidate()) {
2805 EVT ArgVT = Ins[i].ArgVT;
2807 if (Ins[i].PartOffset == 0) {
2808 // If this is a one-part value or the first part of a multi-part value,
2809 // create a stack object for the entire argument value type and return a
2810 // load from our portion of it. This assumes that if the first part of an
2811 // argument is in memory, the rest will also be in memory.
2812 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2813 /*Immutable=*/false);
2814 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2816 ValVT, dl, Chain, PartAddr,
2817 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2819 // This is not the first piece of an argument in memory. See if there is
2820 // already a fixed stack object including this offset. If so, assume it
2821 // was created by the PartOffset == 0 branch above and create a load from
2822 // the appropriate offset into it.
2823 int64_t PartBegin = VA.getLocMemOffset();
2824 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2825 int FI = MFI.getObjectIndexBegin();
2826 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2827 int64_t ObjBegin = MFI.getObjectOffset(FI);
2828 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2829 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2832 if (MFI.isFixedObjectIndex(FI)) {
2834 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2835 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2837 ValVT, dl, Chain, Addr,
2838 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2839 Ins[i].PartOffset));
2844 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2845 VA.getLocMemOffset(), isImmutable);
2847 // Set SExt or ZExt flag.
2848 if (VA.getLocInfo() == CCValAssign::ZExt) {
2849 MFI.setObjectZExt(FI, true);
2850 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2851 MFI.setObjectSExt(FI, true);
2854 // Adjust SP offset of interrupt parameter.
2855 if (CallConv == CallingConv::X86_INTR) {
2856 MFI.setObjectOffset(FI, Offset);
2859 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2860 SDValue Val = DAG.getLoad(
2861 ValVT, dl, Chain, FIN,
2862 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2863 return ExtendedInMem
2864 ? (VA.getValVT().isVector()
2865 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2866 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2870 // FIXME: Get this from tablegen.
2871 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2872 const X86Subtarget &Subtarget) {
2873 assert(Subtarget.is64Bit());
2875 if (Subtarget.isCallingConvWin64(CallConv)) {
2876 static const MCPhysReg GPR64ArgRegsWin64[] = {
2877 X86::RCX, X86::RDX, X86::R8, X86::R9
2879 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2882 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2883 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2885 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2888 // FIXME: Get this from tablegen.
2889 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2890 CallingConv::ID CallConv,
2891 const X86Subtarget &Subtarget) {
2892 assert(Subtarget.is64Bit());
2893 if (Subtarget.isCallingConvWin64(CallConv)) {
2894 // The XMM registers which might contain var arg parameters are shadowed
2895 // in their paired GPR. So we only need to save the GPR to their home
2897 // TODO: __vectorcall will change this.
2901 const Function &F = MF.getFunction();
2902 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2903 bool isSoftFloat = Subtarget.useSoftFloat();
2904 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2905 "SSE register cannot be used when SSE is disabled!");
2906 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2907 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2911 static const MCPhysReg XMMArgRegs64Bit[] = {
2912 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2913 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2915 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2919 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2920 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2921 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2922 return A.getValNo() < B.getValNo();
2927 SDValue X86TargetLowering::LowerFormalArguments(
2928 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2929 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2930 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2931 MachineFunction &MF = DAG.getMachineFunction();
2932 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2933 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2935 const Function &F = MF.getFunction();
2936 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
2937 F.getName() == "main")
2938 FuncInfo->setForceFramePointer(true);
2940 MachineFrameInfo &MFI = MF.getFrameInfo();
2941 bool Is64Bit = Subtarget.is64Bit();
2942 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2945 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2946 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2948 if (CallConv == CallingConv::X86_INTR) {
2949 bool isLegal = Ins.size() == 1 ||
2950 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2951 (!Is64Bit && Ins[1].VT == MVT::i32)));
2953 report_fatal_error("X86 interrupts may take one or two arguments");
2956 // Assign locations to all of the incoming arguments.
2957 SmallVector<CCValAssign, 16> ArgLocs;
2958 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2960 // Allocate shadow area for Win64.
2962 CCInfo.AllocateStack(32, 8);
2964 CCInfo.AnalyzeArguments(Ins, CC_X86);
2966 // In vectorcall calling convention a second pass is required for the HVA
2968 if (CallingConv::X86_VectorCall == CallConv) {
2969 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2972 // The next loop assumes that the locations are in the same order of the
2974 assert(isSortedByValueNo(ArgLocs) &&
2975 "Argument Location list must be sorted before lowering");
2978 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2980 assert(InsIndex < Ins.size() && "Invalid Ins index");
2981 CCValAssign &VA = ArgLocs[I];
2983 if (VA.isRegLoc()) {
2984 EVT RegVT = VA.getLocVT();
2985 if (VA.needsCustom()) {
2987 VA.getValVT() == MVT::v64i1 &&
2988 "Currently the only custom case is when we split v64i1 to 2 regs");
2990 // v64i1 values, in regcall calling convention, that are
2991 // compiled to 32 bit arch, are split up into two registers.
2993 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2995 const TargetRegisterClass *RC;
2996 if (RegVT == MVT::i32)
2997 RC = &X86::GR32RegClass;
2998 else if (Is64Bit && RegVT == MVT::i64)
2999 RC = &X86::GR64RegClass;
3000 else if (RegVT == MVT::f32)
3001 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3002 else if (RegVT == MVT::f64)
3003 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3004 else if (RegVT == MVT::f80)
3005 RC = &X86::RFP80RegClass;
3006 else if (RegVT == MVT::f128)
3007 RC = &X86::FR128RegClass;
3008 else if (RegVT.is512BitVector())
3009 RC = &X86::VR512RegClass;
3010 else if (RegVT.is256BitVector())
3011 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3012 else if (RegVT.is128BitVector())
3013 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3014 else if (RegVT == MVT::x86mmx)
3015 RC = &X86::VR64RegClass;
3016 else if (RegVT == MVT::v1i1)
3017 RC = &X86::VK1RegClass;
3018 else if (RegVT == MVT::v8i1)
3019 RC = &X86::VK8RegClass;
3020 else if (RegVT == MVT::v16i1)
3021 RC = &X86::VK16RegClass;
3022 else if (RegVT == MVT::v32i1)
3023 RC = &X86::VK32RegClass;
3024 else if (RegVT == MVT::v64i1)
3025 RC = &X86::VK64RegClass;
3027 llvm_unreachable("Unknown argument type!");
3029 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3030 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3033 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3034 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3036 if (VA.getLocInfo() == CCValAssign::SExt)
3037 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3038 DAG.getValueType(VA.getValVT()));
3039 else if (VA.getLocInfo() == CCValAssign::ZExt)
3040 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3041 DAG.getValueType(VA.getValVT()));
3042 else if (VA.getLocInfo() == CCValAssign::BCvt)
3043 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3045 if (VA.isExtInLoc()) {
3046 // Handle MMX values passed in XMM regs.
3047 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3048 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3049 else if (VA.getValVT().isVector() &&
3050 VA.getValVT().getScalarType() == MVT::i1 &&
3051 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3052 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3053 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3054 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3056 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3059 assert(VA.isMemLoc());
3061 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3064 // If value is passed via pointer - do a load.
3065 if (VA.getLocInfo() == CCValAssign::Indirect)
3067 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3069 InVals.push_back(ArgValue);
3072 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3073 // Swift calling convention does not require we copy the sret argument
3074 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3075 if (CallConv == CallingConv::Swift)
3078 // All x86 ABIs require that for returning structs by value we copy the
3079 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3080 // the argument into a virtual register so that we can access it from the
3082 if (Ins[I].Flags.isSRet()) {
3083 unsigned Reg = FuncInfo->getSRetReturnReg();
3085 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3086 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3087 FuncInfo->setSRetReturnReg(Reg);
3089 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3090 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3095 unsigned StackSize = CCInfo.getNextStackOffset();
3096 // Align stack specially for tail calls.
3097 if (shouldGuaranteeTCO(CallConv,
3098 MF.getTarget().Options.GuaranteedTailCallOpt))
3099 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3101 // If the function takes variable number of arguments, make a frame index for
3102 // the start of the first vararg value... for expansion of llvm.va_start. We
3103 // can skip this if there are no va_start calls.
3104 if (MFI.hasVAStart() &&
3105 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3106 CallConv != CallingConv::X86_ThisCall))) {
3107 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3110 // Figure out if XMM registers are in use.
3111 assert(!(Subtarget.useSoftFloat() &&
3112 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3113 "SSE register cannot be used when SSE is disabled!");
3115 // 64-bit calling conventions support varargs and register parameters, so we
3116 // have to do extra work to spill them in the prologue.
3117 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3118 // Find the first unallocated argument registers.
3119 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3120 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3121 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3122 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3123 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3124 "SSE register cannot be used when SSE is disabled!");
3126 // Gather all the live in physical registers.
3127 SmallVector<SDValue, 6> LiveGPRs;
3128 SmallVector<SDValue, 8> LiveXMMRegs;
3130 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3131 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3133 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3135 if (!ArgXMMs.empty()) {
3136 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3137 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3138 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3139 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3140 LiveXMMRegs.push_back(
3141 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3146 // Get to the caller-allocated home save location. Add 8 to account
3147 // for the return address.
3148 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3149 FuncInfo->setRegSaveFrameIndex(
3150 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3151 // Fixup to set vararg frame on shadow area (4 x i64).
3153 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3155 // For X86-64, if there are vararg parameters that are passed via
3156 // registers, then we must store them to their spots on the stack so
3157 // they may be loaded by dereferencing the result of va_next.
3158 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3159 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3160 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3161 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3164 // Store the integer parameter registers.
3165 SmallVector<SDValue, 8> MemOps;
3166 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3167 getPointerTy(DAG.getDataLayout()));
3168 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3169 for (SDValue Val : LiveGPRs) {
3170 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3171 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3173 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3174 MachinePointerInfo::getFixedStack(
3175 DAG.getMachineFunction(),
3176 FuncInfo->getRegSaveFrameIndex(), Offset));
3177 MemOps.push_back(Store);
3181 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3182 // Now store the XMM (fp + vector) parameter registers.
3183 SmallVector<SDValue, 12> SaveXMMOps;
3184 SaveXMMOps.push_back(Chain);
3185 SaveXMMOps.push_back(ALVal);
3186 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3187 FuncInfo->getRegSaveFrameIndex(), dl));
3188 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3189 FuncInfo->getVarArgsFPOffset(), dl));
3190 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3192 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3193 MVT::Other, SaveXMMOps));
3196 if (!MemOps.empty())
3197 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3200 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3201 // Find the largest legal vector type.
3202 MVT VecVT = MVT::Other;
3203 // FIXME: Only some x86_32 calling conventions support AVX512.
3204 if (Subtarget.hasAVX512() &&
3205 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3206 CallConv == CallingConv::Intel_OCL_BI)))
3207 VecVT = MVT::v16f32;
3208 else if (Subtarget.hasAVX())
3210 else if (Subtarget.hasSSE2())
3213 // We forward some GPRs and some vector types.
3214 SmallVector<MVT, 2> RegParmTypes;
3215 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3216 RegParmTypes.push_back(IntVT);
3217 if (VecVT != MVT::Other)
3218 RegParmTypes.push_back(VecVT);
3220 // Compute the set of forwarded registers. The rest are scratch.
3221 SmallVectorImpl<ForwardedRegister> &Forwards =
3222 FuncInfo->getForwardedMustTailRegParms();
3223 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3225 // Conservatively forward AL on x86_64, since it might be used for varargs.
3226 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3227 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3228 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3231 // Copy all forwards from physical to virtual registers.
3232 for (ForwardedRegister &F : Forwards) {
3233 // FIXME: Can we use a less constrained schedule?
3234 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3235 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3236 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3240 // Some CCs need callee pop.
3241 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3242 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3243 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3244 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3245 // X86 interrupts must pop the error code (and the alignment padding) if
3247 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3249 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3250 // If this is an sret function, the return should pop the hidden pointer.
3251 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3252 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3253 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3254 FuncInfo->setBytesToPopOnReturn(4);
3258 // RegSaveFrameIndex is X86-64 only.
3259 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3260 if (CallConv == CallingConv::X86_FastCall ||
3261 CallConv == CallingConv::X86_ThisCall)
3262 // fastcc functions can't have varargs.
3263 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3266 FuncInfo->setArgumentStackSize(StackSize);
3268 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3269 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3270 if (Personality == EHPersonality::CoreCLR) {
3272 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3273 // that we'd prefer this slot be allocated towards the bottom of the frame
3274 // (i.e. near the stack pointer after allocating the frame). Every
3275 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3276 // offset from the bottom of this and each funclet's frame must be the
3277 // same, so the size of funclets' (mostly empty) frames is dictated by
3278 // how far this slot is from the bottom (since they allocate just enough
3279 // space to accommodate holding this slot at the correct offset).
3280 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3281 EHInfo->PSPSymFrameIdx = PSPSymFI;
3285 if (CallConv == CallingConv::X86_RegCall ||
3286 F.hasFnAttribute("no_caller_saved_registers")) {
3287 MachineRegisterInfo &MRI = MF.getRegInfo();
3288 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3289 MRI.disableCalleeSavedRegister(Pair.first);
/// Emit the store (or byval block copy) of one outgoing call argument that
/// was assigned a stack location by calling-convention analysis.
/// The destination slot address is StackPtr + VA.getLocMemOffset().
/// Returns the chain produced by the store/copy node.
3295 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3296 SDValue Arg, const SDLoc &dl,
3298 const CCValAssign &VA,
3299 ISD::ArgFlagsTy Flags) const {
3300 unsigned LocMemOffset = VA.getLocMemOffset();
3301 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
// Compute the slot address: stack pointer plus the assigned offset.
3302 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
// byval arguments are copied as a memory block rather than stored as a value.
3304 if (Flags.isByVal())
3305 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3307 return DAG.getStore(
3308 Chain, dl, Arg, PtrOff,
3309 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3312 /// Emit a load of the return address if tail call
3313 /// optimization is performed and it is required.
/// On return, OutRetAddr (an out-parameter) holds the loaded return-address
/// value; the function's own result is the load's output chain (value #1 of
/// the load node), which the caller threads into subsequent operations.
3314 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3315     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3316     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3317   // Adjust the Return address stack slot.
3318   EVT VT = getPointerTy(DAG.getDataLayout());
// First obtain the frame-index address of the return-address slot ...
3319   OutRetAddr = getReturnAddressFrameIndex(DAG);
3321   // Load the "old" Return address.
3322   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
// ... then return the load's chain so the caller can sequence after it.
3323   return SDValue(OutRetAddr.getNode(), 1);
3326 /// Emit a store of the return address if tail call
3327 /// optimization is performed and it is required (FPDiff!=0).
/// The previously loaded return address (RetAddrFrIdx) is written into a new
/// fixed stack object at offset FPDiff - SlotSize — the slot where the return
/// address must live after the tail-call frame adjustment. Returns the store's
/// chain, or the incoming Chain unchanged when FPDiff is zero.
3328 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3329 SDValue Chain, SDValue RetAddrFrIdx,
3330 EVT PtrVT, unsigned SlotSize,
3331 int FPDiff, const SDLoc &dl) {
3332 // Store the return address to the appropriate stack slot.
// No frame-pointer delta means the return address is already in place.
3333 if (!FPDiff) return Chain;
3334 // Calculate the new stack slot for the return address.
3335 int NewReturnAddrFI =
3336 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3338 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3339 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3340 MachinePointerInfo::getFixedStack(
3341 DAG.getMachineFunction(), NewReturnAddrFI));
3345 /// Returns a vector_shuffle mask for an movs{s|d}, movd
3346 /// operation of specified width.
/// Mask index NumElems refers to element 0 of the second operand (V2) under
/// LLVM shuffle-mask conventions, so lane 0 of the result is taken from V2.
3347 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3349 unsigned NumElems = VT.getVectorNumElements();
3350 SmallVector<int, 8> Mask;
// Lane 0 selects the first element of V2.
3351 Mask.push_back(NumElems);
// Remaining lanes are appended by the loop below.
3352 for (unsigned i = 1; i != NumElems; ++i)
3354 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
/// Lower an outgoing call: classify tail-call/sibcall eligibility, run the
/// calling-convention analysis, emit the argument register copies and stack
/// stores, materialize the callee address, and build either an
/// X86ISD::TC_RETURN node (tail call) or an X86ISD::CALL node followed by
/// CALLSEQ_END and the result-value copies (via LowerCallResult).
3358 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3359 SmallVectorImpl<SDValue> &InVals) const {
3360 SelectionDAG &DAG = CLI.DAG;
3362 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3363 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3364 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3365 SDValue Chain = CLI.Chain;
3366 SDValue Callee = CLI.Callee;
3367 CallingConv::ID CallConv = CLI.CallConv;
3368 bool &isTailCall = CLI.IsTailCall;
3369 bool isVarArg = CLI.IsVarArg;
3371 MachineFunction &MF = DAG.getMachineFunction();
3372 bool Is64Bit = Subtarget.is64Bit();
3373 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3374 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3375 bool IsSibcall = false;
3376 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3377 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3378 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3379 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
// NCSR can come from the call site or from the callee's declaration.
3380 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3381 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3383 if (CallConv == CallingConv::X86_INTR)
3384 report_fatal_error("X86 interrupts may not be called directly")
3386 if (Attr.getValueAsString() == "true")
3389 if (Subtarget.isPICStyleGOT() &&
3390 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3391 // If we are using a GOT, disable tail calls to external symbols with
3392 // default visibility. Tail calling such a symbol requires using a GOT
3393 // relocation, which forces early binding of the symbol. This breaks code
3394 // that requires lazy function symbol resolution. Using musttail or
3395 // GuaranteedTailCallOpt will override this.
3396 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3397 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3398 G->getGlobal()->hasDefaultVisibility()))
3402 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3404 // Force this to be a tail call. The verifier rules are enough to ensure
3405 // that we can lower this successfully without moving the return address
3408 } else if (isTailCall) {
3409 // Check if it's really possible to do a tail call.
3410 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3411 isVarArg, SR != NotStructReturn,
3412 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3413 Outs, OutVals, Ins, DAG);
3415 // Sibcalls are automatically detected tailcalls which do not require
3417 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3424 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3425 "Var args not supported with calling convention fastcc, ghc or hipe");
3427 // Analyze operands of the call, assigning locations to each operand.
3428 SmallVector<CCValAssign, 16> ArgLocs;
3429 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3431 // Allocate shadow area for Win64.
3433 CCInfo.AllocateStack(32, 8);
3435 CCInfo.AnalyzeArguments(Outs, CC_X86);
3437 // In vectorcall calling convention a second pass is required for the HVA
3439 if (CallingConv::X86_VectorCall == CallConv) {
3440 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3443 // Get a count of how many bytes are to be pushed on the stack.
3444 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3446 // This is a sibcall. The memory operands are available in caller's
3447 // own caller's stack.
3449 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3450 canGuaranteeTCO(CallConv))
3451 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3454 if (isTailCall && !IsSibcall && !IsMustTail) {
3455 // Lower arguments at fp - stackoffset + fpdiff.
3456 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3458 FPDiff = NumBytesCallerPushed - NumBytes;
3460 // Set the delta of movement of the returnaddr stackslot.
3461 // But only set if delta is greater than previous delta.
3462 if (FPDiff < X86Info->getTCReturnAddrDelta())
3463 X86Info->setTCReturnAddrDelta(FPDiff);
3466 unsigned NumBytesToPush = NumBytes;
3467 unsigned NumBytesToPop = NumBytes;
3469 // If we have an inalloca argument, all stack space has already been allocated
3470 // for us and is right at the top of the stack. We don't support multiple
3471 // arguments passed in memory when using inalloca.
3472 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3474 if (!ArgLocs.back().isMemLoc())
3475 report_fatal_error("cannot use inalloca attribute on a register "
3477 if (ArgLocs.back().getLocMemOffset() != 0)
3478 report_fatal_error("any parameter with the inalloca attribute must be "
3479 "the only memory argument");
3483 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3484 NumBytes - NumBytesToPush, dl);
3486 SDValue RetAddrFrIdx;
3487 // Load return address for tail calls.
3488 if (isTailCall && FPDiff)
3489 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3490 Is64Bit, FPDiff, dl);
3492 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3493 SmallVector<SDValue, 8> MemOpChains;
3496 // The next loop assumes that the locations are in the same order of the
3498 assert(isSortedByValueNo(ArgLocs) &&
3499 "Argument Location list must be sorted before lowering");
3501 // Walk the register/memloc assignments, inserting copies/loads. In the case
3502 // of tail call optimization arguments are handled later.
3503 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3504 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3506 assert(OutIndex < Outs.size() && "Invalid Out index");
3507 // Skip inalloca arguments, they have already been written.
3508 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3509 if (Flags.isInAlloca())
3512 CCValAssign &VA = ArgLocs[I];
3513 EVT RegVT = VA.getLocVT();
3514 SDValue Arg = OutVals[OutIndex];
3515 bool isByVal = Flags.isByVal();
3517 // Promote the value if needed.
3518 switch (VA.getLocInfo()) {
3519 default: llvm_unreachable("Unknown loc info!");
3520 case CCValAssign::Full: break;
3521 case CCValAssign::SExt:
3522 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3524 case CCValAssign::ZExt:
3525 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3527 case CCValAssign::AExt:
// i1 mask vectors need a dedicated lowering into a register value.
3528 if (Arg.getValueType().isVector() &&
3529 Arg.getValueType().getVectorElementType() == MVT::i1)
3530 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3531 else if (RegVT.is128BitVector()) {
3532 // Special case: passing MMX values in XMM registers.
3533 Arg = DAG.getBitcast(MVT::i64, Arg);
3534 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3535 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3537 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3539 case CCValAssign::BCvt:
3540 Arg = DAG.getBitcast(RegVT, Arg);
3542 case CCValAssign::Indirect: {
3543 // Store the argument.
3544 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3545 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3546 Chain = DAG.getStore(
3547 Chain, dl, Arg, SpillSlot,
3548 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3554 if (VA.needsCustom()) {
3555 assert(VA.getValVT() == MVT::v64i1 &&
3556 "Currently the only custom case is when we split v64i1 to 2 regs");
3557 // Split v64i1 value into two registers
// Consumes two ArgLocs entries, hence the ++I on the second location.
3558 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3560 } else if (VA.isRegLoc()) {
3561 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3562 if (isVarArg && IsWin64) {
3563 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3564 // shadow reg if callee is a varargs function.
3565 unsigned ShadowReg = 0;
3566 switch (VA.getLocReg()) {
3567 case X86::XMM0: ShadowReg = X86::RCX; break;
3568 case X86::XMM1: ShadowReg = X86::RDX; break;
3569 case X86::XMM2: ShadowReg = X86::R8; break;
3570 case X86::XMM3: ShadowReg = X86::R9; break;
3573 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3575 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3576 assert(VA.isMemLoc());
// Lazily materialize the stack-pointer copy only when a stack store exists.
3577 if (!StackPtr.getNode())
3578 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3579 getPointerTy(DAG.getDataLayout()));
3580 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3581 dl, DAG, VA, Flags));
3585 if (!MemOpChains.empty())
3586 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3588 if (Subtarget.isPICStyleGOT()) {
3589 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3592 RegsToPass.push_back(std::make_pair(
3593 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3594 getPointerTy(DAG.getDataLayout()))));
3596 // If we are tail calling and generating PIC/GOT style code load the
3597 // address of the callee into ECX. The value in ecx is used as target of
3598 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3599 // for tail calls on PIC/GOT architectures. Normally we would just put the
3600 // address of GOT into ebx and then call target@PLT. But for tail calls
3601 // ebx would be restored (since ebx is callee saved) before jumping to the
3604 // Note: The actual moving to ECX is done further down.
3605 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3606 if (G && !G->getGlobal()->hasLocalLinkage() &&
3607 G->getGlobal()->hasDefaultVisibility())
3608 Callee = LowerGlobalAddress(Callee, DAG);
3609 else if (isa<ExternalSymbolSDNode>(Callee))
3610 Callee = LowerExternalSymbol(Callee, DAG);
3614 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3615 // From AMD64 ABI document:
3616 // For calls that may call functions that use varargs or stdargs
3617 // (prototype-less calls or calls to functions containing ellipsis (...) in
3618 // the declaration) %al is used as hidden argument to specify the number
3619 // of SSE registers used. The contents of %al do not need to match exactly
3620 // the number of registers, but must be an upper bound on the number of SSE
3621 // registers used and is in the range 0 - 8 inclusive.
3623 // Count the number of XMM registers allocated.
3624 static const MCPhysReg XMMArgRegs[] = {
3625 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3626 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3628 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3629 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3630 && "SSE registers cannot be used when SSE is disabled");
3632 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3633 DAG.getConstant(NumXMMRegs, dl,
3637 if (isVarArg && IsMustTail) {
// Re-materialize the registers saved by LowerFormalArguments for forwarding.
3638 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3639 for (const auto &F : Forwards) {
3640 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3641 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3645 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3646 // don't need this because the eligibility check rejects calls that require
3647 // shuffling arguments passed in memory.
3648 if (!IsSibcall && isTailCall) {
3649 // Force all the incoming stack arguments to be loaded from the stack
3650 // before any new outgoing arguments are stored to the stack, because the
3651 // outgoing stack slots may alias the incoming argument stack slots, and
3652 // the alias isn't otherwise explicit. This is slightly more conservative
3653 // than necessary, because it means that each store effectively depends
3654 // on every argument instead of just those arguments it would clobber.
3655 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3657 SmallVector<SDValue, 8> MemOpChains2;
3660 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3662 CCValAssign &VA = ArgLocs[I];
3664 if (VA.isRegLoc()) {
3665 if (VA.needsCustom()) {
3666 assert((CallConv == CallingConv::X86_RegCall) &&
3667 "Expecting custom case only in regcall calling convention");
3668 // This means that we are in special case where one argument was
3669 // passed through two register locations - Skip the next location
3676 assert(VA.isMemLoc());
3677 SDValue Arg = OutVals[OutsIndex];
3678 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3679 // Skip inalloca arguments. They don't require any work.
3680 if (Flags.isInAlloca())
3682 // Create frame index.
// FPDiff shifts the slot to its post-adjustment ("real") position.
3683 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3684 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3685 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3686 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3688 if (Flags.isByVal()) {
3689 // Copy relative to framepointer.
3690 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3691 if (!StackPtr.getNode())
3692 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3693 getPointerTy(DAG.getDataLayout()));
3694 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3697 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3701 // Store relative to framepointer.
3702 MemOpChains2.push_back(DAG.getStore(
3703 ArgChain, dl, Arg, FIN,
3704 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3708 if (!MemOpChains2.empty())
3709 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3711 // Store the return address to the appropriate stack slot.
3712 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3713 getPointerTy(DAG.getDataLayout()),
3714 RegInfo->getSlotSize(), FPDiff, dl);
3717 // Build a sequence of copy-to-reg nodes chained together with token chain
3718 // and flag operands which copy the outgoing args into registers.
3720 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3721 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3722 RegsToPass[i].second, InFlag);
3723 InFlag = Chain.getValue(1);
3726 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3727 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3728 // In the 64-bit large code model, we have to make all calls
3729 // through a register, since the call instruction's 32-bit
3730 // pc-relative offset may not be large enough to hold the whole
3732 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3733 // If the callee is a GlobalAddress node (quite common, every direct call
3734 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3736 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3738 // We should use extra load for direct calls to dllimported functions in
3740 const GlobalValue *GV = G->getGlobal();
3741 if (!GV->hasDLLImportStorageClass()) {
3742 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3744 Callee = DAG.getTargetGlobalAddress(
3745 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3747 if (OpFlags == X86II::MO_GOTPCREL) {
3749 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3750 getPointerTy(DAG.getDataLayout()), Callee);
3751 // Add extra indirection
3752 Callee = DAG.getLoad(
3753 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3754 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3757 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3758 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3759 unsigned char OpFlags =
3760 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3762 Callee = DAG.getTargetExternalSymbol(
3763 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3764 } else if (Subtarget.isTarget64BitILP32() &&
3765 Callee->getValueType(0) == MVT::i32) {
3766 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3767 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3770 // Returns a chain & a flag for retval copy to use.
3771 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3772 SmallVector<SDValue, 8> Ops;
3774 if (!IsSibcall && isTailCall) {
3775 Chain = DAG.getCALLSEQ_END(Chain,
3776 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3777 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3778 InFlag = Chain.getValue(1);
3781 Ops.push_back(Chain);
3782 Ops.push_back(Callee);
3785 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3787 // Add argument registers to the end of the list so that they are known live
3789 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3790 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3791 RegsToPass[i].second.getValueType()));
3793 // Add a register mask operand representing the call-preserved registers.
3794 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3795 // set X86_INTR calling convention because it has the same CSR mask
3796 // (same preserved registers).
3797 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3798 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3799 assert(Mask && "Missing call preserved mask for calling convention");
3801 // If this is an invoke in a 32-bit function using a funclet-based
3802 // personality, assume the function clobbers all registers. If an exception
3803 // is thrown, the runtime will not restore CSRs.
3804 // FIXME: Model this more precisely so that we can register allocate across
3805 // the normal edge and spill and fill across the exceptional edge.
3806 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3807 const Function &CallerFn = MF.getFunction();
3808 EHPersonality Pers =
3809 CallerFn.hasPersonalityFn()
3810 ? classifyEHPersonality(CallerFn.getPersonalityFn())
3811 : EHPersonality::Unknown;
3812 if (isFuncletEHPersonality(Pers))
3813 Mask = RegInfo->getNoPreservedMask();
3816 // Define a new register mask from the existing mask.
3817 uint32_t *RegMask = nullptr;
3819 // In some calling conventions we need to remove the used physical registers
3820 // from the reg mask.
3821 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3822 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3824 // Allocate a new Reg Mask and copy Mask.
3825 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3826 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3827 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3829 // Make sure all sub registers of the argument registers are reset
3831 for (auto const &RegPair : RegsToPass)
3832 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3833 SubRegs.isValid(); ++SubRegs)
3834 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3836 // Create the RegMask Operand according to our updated mask.
3837 Ops.push_back(DAG.getRegisterMask(RegMask));
3839 // Create the RegMask Operand according to the static mask.
3840 Ops.push_back(DAG.getRegisterMask(Mask));
3843 if (InFlag.getNode())
3844 Ops.push_back(InFlag);
3848 //// If this is the first return lowered for this function, add the regs
3849 //// to the liveout set for the function.
3850 // This isn't right, although it's probably harmless on x86; liveouts
3851 // should be computed from returns not tail calls. Consider a void
3852 // function making a tail call to a function returning int.
3853 MF.getFrameInfo().setHasTailCall();
3854 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3857 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3858 InFlag = Chain.getValue(1);
3860 // Create the CALLSEQ_END node.
3861 unsigned NumBytesForCalleeToPop;
3862 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3863 DAG.getTarget().Options.GuaranteedTailCallOpt))
3864 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3865 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3866 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3867 SR == StackStructReturn)
3868 // If this is a call to a struct-return function, the callee
3869 // pops the hidden struct pointer, so we have to push it back.
3870 // This is common for Darwin/X86, Linux & Mingw32 targets.
3871 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3872 NumBytesForCalleeToPop = 4;
3874 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3876 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3877 // No need to reset the stack after the call if the call doesn't return. To
3878 // make the MI verify, we'll pretend the callee does it for us.
3879 NumBytesForCalleeToPop = NumBytes;
3882 // Returns a flag for retval copy to use.
3884 Chain = DAG.getCALLSEQ_END(Chain,
3885 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3886 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3889 InFlag = Chain.getValue(1);
3892 // Handle result values, copying them out of physregs into vregs that we
3894 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3898 //===----------------------------------------------------------------------===//
3899 // Fast Calling Convention (tail call) implementation
3900 //===----------------------------------------------------------------------===//
3902 // Like stdcall, the callee cleans up the arguments, except that ECX is
3903 // reserved for storing the tail-called function's address. Only 2 registers
3904 // are free for argument passing (inreg). Tail call optimization is performed
3906 // * tailcallopt is enabled
3907 // * caller/callee are fastcc
3908 // On X86_64 architecture with GOT-style position independent code only local
3909 // (within module) calls are supported at the moment.
3910 // To keep the stack aligned according to platform abi the function
3911 // GetAlignedArgumentStackSize ensures that argument delta is always multiples
3912 // of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3913 // If a tail called function callee has more arguments than the caller the
3914 // caller needs to make sure that there is room to move the RETADDR to. This is
3915 // achieved by reserving an area the size of the argument delta right after the
3916 // original RETADDR, but before the saved framepointer or the spilled registers
3917 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3929 /// Round StackSize up so that, together with the SlotSize-byte return
/// address already on the stack, the stack stays aligned — e.g. to 16n + 12
/// for a 16-byte alignment requirement with a 4-byte slot.
3932 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3933 SelectionDAG& DAG) const {
3934 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3935 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3936 unsigned StackAlignment = TFI.getStackAlignment();
3937 uint64_t AlignMask = StackAlignment - 1;
3938 int64_t Offset = StackSize;
3939 unsigned SlotSize = RegInfo->getSlotSize();
// Target residue is (StackAlignment - SlotSize) mod StackAlignment.
3940 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3941 // Current residue is at or below the target; just add the difference.
3942 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3944 // Otherwise mask out the low bits and add one full alignment unit plus
3945 // the target residue.
3945 Offset = ((~AlignMask) & Offset) + StackAlignment +
3946 (StackAlignment-SlotSize);
3951 /// Return true if the given stack call argument is already available in the
3952 /// same position (relatively) of the caller's incoming argument stack.
/// Used by tail-call eligibility checking: when every outgoing stack argument
/// already occupies the matching incoming slot, no argument shuffling stores
/// are needed and a sibcall remains possible.
3954 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3955 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3956 const X86InstrInfo *TII, const CCValAssign &VA) {
3957 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3960 // Look through nodes that don't alter the bits of the incoming value.
3961 unsigned Op = Arg.getOpcode();
3962 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3963 Arg = Arg.getOperand(0);
3966 if (Op == ISD::TRUNCATE) {
3967 const SDValue &TruncInput = Arg.getOperand(0);
// A truncate of an AssertZext back to the asserted type is bit-preserving.
3968 if (TruncInput.getOpcode() == ISD::AssertZext &&
3969 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3970 Arg.getValueType()) {
3971 Arg = TruncInput.getOperand(0);
3979 if (Arg.getOpcode() == ISD::CopyFromReg) {
3980 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3981 if (!TargetRegisterInfo::isVirtualRegister(VR))
// Trace the virtual register back to its defining machine instruction.
3983 MachineInstr *Def = MRI->getVRegDef(VR);
3986 if (!Flags.isByVal()) {
// Non-byval: the def must be a reload from a stack slot; record its FI.
3987 if (!TII->isLoadFromStackSlot(*Def, FI))
3990 unsigned Opcode = Def->getOpcode();
// Byval: accept an LEA of a frame index (the address of the object).
3991 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3992 Opcode == X86::LEA64_32r) &&
3993 Def->getOperand(1).isFI()) {
3994 FI = Def->getOperand(1).getIndex();
3995 Bytes = Flags.getByValSize();
3999 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4000 if (Flags.isByVal())
4001 // ByVal argument is passed in as a pointer but it's now being
4002 // dereferenced. e.g.
4003 // define @foo(%struct.X* %A) {
4004 // tail call @bar(%struct.X* byval %A)
4007 SDValue Ptr = Ld->getBasePtr();
4008 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4011 FI = FINode->getIndex();
4012 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4013 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4014 FI = FINode->getIndex();
4015 Bytes = Flags.getByValSize();
4019 assert(FI != INT_MAX);
// Only fixed (incoming-argument) stack objects can match the caller's slots.
4020 if (!MFI.isFixedObjectIndex(FI))
// The object must sit at exactly the offset the callee expects.
4023 if (Offset != MFI.getObjectOffset(FI))
4026 // If this is not byval, check that the argument stack object is immutable.
4027 // inalloca and argument copy elision can create mutable argument stack
4028 // objects. Byval objects can be mutated, but a byval call intends to pass the
4030 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4033 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4034 // If the argument location is wider than the argument type, check that any
4035 // extension flags match.
4036 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4037 Flags.isSExt() != MFI.isObjectSExt(FI)) {
// Finally the sizes must agree for the slot to be reusable as-is.
4042 return Bytes == MFI.getObjectSize(FI);
// NOTE(review): this extract is missing several original lines (the early
// `return false;` statements after most checks, some closing braces, and a
// few continuation lines). Only comments were added below; the visible code
// is unchanged. Verify the elided lines against the upstream file.
4045 /// Check whether the call is eligible for tail call optimization. Targets
4046 /// that want to do tail call optimization should implement this function.
4047 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4048     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4049     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4050     const SmallVectorImpl<ISD::OutputArg> &Outs,
4051     const SmallVectorImpl<SDValue> &OutVals,
4052     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
// Calling conventions that can never be tail-called bail out immediately.
4053 if (!mayTailCallThisCC(CalleeCC))
4056 // If -tailcallopt is specified, make fastcc functions tail-callable.
4057 MachineFunction &MF = DAG.getMachineFunction();
4058 const Function &CallerF = MF.getFunction();
4060 // If the function return type is x86_fp80 and the callee return type is not,
4061 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4062 // perform a tailcall optimization here.
4063 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4066 CallingConv::ID CallerCC = CallerF.getCallingConv();
4067 bool CCMatch = CallerCC == CalleeCC;
4068 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4069 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4071 // Win64 functions have extra shadow space for argument homing. Don't do the
4072 // sibcall if the caller and callee have mismatched expectations for this
4074 if (IsCalleeWin64 != IsCallerWin64)
// Under GuaranteedTailCallOpt, matching guaranteed-TCO conventions are
// accepted here; everything else falls through to the sibcall checks below.
4077 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4078 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4083 // Look for obvious safe cases to perform tail call optimization that do not
4084 // require ABI changes. This is what gcc calls sibcall.
4086 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4087 // emit a special epilogue.
4088 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4089 if (RegInfo->needsStackRealignment(MF))
4092 // Also avoid sibcall optimization if either caller or callee uses struct
4093 // return semantics.
4094 if (isCalleeStructRet || isCallerStructRet)
4097 // Do not sibcall optimize vararg calls unless all arguments are passed via
4099 LLVMContext &C = *DAG.getContext();
4100 if (isVarArg && !Outs.empty()) {
4101 // Optimizing for varargs on Win64 is unlikely to be safe without
4102 // additional testing.
4103 if (IsCalleeWin64 || IsCallerWin64)
4106 SmallVector<CCValAssign, 16> ArgLocs;
4107 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4109 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
// Any stack-assigned vararg argument defeats the sibcall.
4110 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4111 if (!ArgLocs[i].isRegLoc())
4115 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4116 // stack. Therefore, if it's not used by the call it is not safe to optimize
4117 // this into a sibcall.
4118 bool Unused = false;
4119 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4126 SmallVector<CCValAssign, 16> RVLocs;
4127 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4128 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4129 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4130 CCValAssign &VA = RVLocs[i];
4131 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4136 // Check that the call results are passed in the same way.
4137 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4138 RetCC_X86, RetCC_X86))
4140 // The callee has to preserve all registers the caller needs to preserve.
4141 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4142 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4144 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4145 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4149 unsigned StackArgsSize = 0;
4151 // If the callee takes no arguments then go on to check the results of the
4153 if (!Outs.empty()) {
4154 // Check if stack adjustment is needed. For now, do not do this if any
4155 // argument is passed on the stack.
4156 SmallVector<CCValAssign, 16> ArgLocs;
4157 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4159 // Allocate shadow area for Win64
4161 CCInfo.AllocateStack(32, 8);
4163 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4164 StackArgsSize = CCInfo.getNextStackOffset();
4166 if (CCInfo.getNextStackOffset()) {
4167 // Check if the arguments are already laid out in the right way as
4168 // the caller's fixed stack objects.
4169 MachineFrameInfo &MFI = MF.getFrameInfo();
4170 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4171 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4172 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4173 CCValAssign &VA = ArgLocs[i];
4174 SDValue Arg = OutVals[i];
4175 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4176 if (VA.getLocInfo() == CCValAssign::Indirect)
4178 if (!VA.isRegLoc()) {
4179 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4186 bool PositionIndependent = isPositionIndependent();
4187 // If the tailcall address may be in a register, then make sure it's
4188 // possible to register allocate for it. In 32-bit, the call address can
4189 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4190 // callee-saved registers are restored. These happen to be the same
4191 // registers used to pass 'inreg' arguments so watch out for those.
4192 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4193 !isa<ExternalSymbolSDNode>(Callee)) ||
4194 PositionIndependent)) {
4195 unsigned NumInRegs = 0;
4196 // In PIC we need an extra register to formulate the address computation
4198 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4200 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4201 CCValAssign &VA = ArgLocs[i];
4204 unsigned Reg = VA.getLocReg();
4207 case X86::EAX: case X86::EDX: case X86::ECX:
4208 if (++NumInRegs == MaxInRegs)
4215 const MachineRegisterInfo &MRI = MF.getRegInfo();
4216 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
// Finally, callee-pop behavior must agree with the caller's expectations:
// if the caller pops bytes on return, the callee must pop the same amount.
4220 bool CalleeWillPop =
4221 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4222 MF.getTarget().Options.GuaranteedTailCallOpt);
4224 if (unsigned BytesToPop =
4225 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4226 // If we have bytes to pop, the callee must pop them.
4227 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4228 if (!CalleePopMatches)
4230 } else if (CalleeWillPop && StackArgsSize > 0) {
4231 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4239 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4240 const TargetLibraryInfo *libInfo) const {
4241 return X86::createFastISel(funcInfo, libInfo);
4244 //===----------------------------------------------------------------------===//
4245 // Other Lowering Hooks
4246 //===----------------------------------------------------------------------===//
4248 static bool MayFoldLoad(SDValue Op) {
4249 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4252 static bool MayFoldIntoStore(SDValue Op) {
4253 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4256 static bool MayFoldIntoZeroExtend(SDValue Op) {
4257 if (Op.hasOneUse()) {
4258 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4259 return (ISD::ZERO_EXTEND == Opcode);
/// Return true if \p Opcode is one of the X86-specific target shuffle node
/// opcodes (shuffles whose semantics are fixed by the instruction).
// NOTE(review): the `switch (Opcode) {` header, a few `case` labels, the
// shared `return true;` and the closing braces were elided in this extract;
// only comments were added here.
4264 static bool isTargetShuffle(unsigned Opcode) {
4266 default: return false;
4267 case X86ISD::BLENDI:
4268 case X86ISD::PSHUFB:
4269 case X86ISD::PSHUFD:
4270 case X86ISD::PSHUFHW:
4271 case X86ISD::PSHUFLW:
4273 case X86ISD::INSERTPS:
4274 case X86ISD::EXTRQI:
4275 case X86ISD::INSERTQI:
4276 case X86ISD::PALIGNR:
4277 case X86ISD::VSHLDQ:
4278 case X86ISD::VSRLDQ:
4279 case X86ISD::MOVLHPS:
4280 case X86ISD::MOVHLPS:
4281 case X86ISD::MOVLPS:
4282 case X86ISD::MOVLPD:
4283 case X86ISD::MOVSHDUP:
4284 case X86ISD::MOVSLDUP:
4285 case X86ISD::MOVDDUP:
4288 case X86ISD::UNPCKL:
4289 case X86ISD::UNPCKH:
4290 case X86ISD::VBROADCAST:
4291 case X86ISD::VPERMILPI:
4292 case X86ISD::VPERMILPV:
4293 case X86ISD::VPERM2X128:
4294 case X86ISD::VPERMIL2:
4295 case X86ISD::VPERMI:
4296 case X86ISD::VPPERM:
4297 case X86ISD::VPERMV:
4298 case X86ISD::VPERMV3:
4299 case X86ISD::VPERMIV3:
4300 case X86ISD::VZEXT_MOVL:
/// Return true if \p Opcode is a target shuffle whose mask comes from a
/// variable (register/memory) operand rather than an immediate.
// NOTE(review): the `switch (Opcode) {` header, the 'faux shuffle' case
// labels, the shared `return true;` and closing braces were elided in this
// extract; only comments were added here.
4305 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4307 default: return false;
4309 case X86ISD::PSHUFB:
4310 case X86ISD::VPERMILPV:
4311 case X86ISD::VPERMIL2:
4312 case X86ISD::VPPERM:
4313 case X86ISD::VPERMV:
4314 case X86ISD::VPERMV3:
4315 case X86ISD::VPERMIV3:
4317 // 'Faux' Target Shuffles.
4324 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4325 MachineFunction &MF = DAG.getMachineFunction();
4326 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4327 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4328 int ReturnAddrIndex = FuncInfo->getRAIndex();
4330 if (ReturnAddrIndex == 0) {
4331 // Set up a frame object for the return address.
4332 unsigned SlotSize = RegInfo->getSlotSize();
4333 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4336 FuncInfo->setRAIndex(ReturnAddrIndex);
4339 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4342 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4343 bool hasSymbolicDisplacement) {
4344 // Offset should fit into 32 bit immediate field.
4345 if (!isInt<32>(Offset))
4348 // If we don't have a symbolic displacement - we don't have any extra
4350 if (!hasSymbolicDisplacement)
4353 // FIXME: Some tweaks might be needed for medium code model.
4354 if (M != CodeModel::Small && M != CodeModel::Kernel)
4357 // For small code model we assume that latest object is 16MB before end of 31
4358 // bits boundary. We may also accept pretty large negative constants knowing
4359 // that all objects are in the positive half of address space.
4360 if (M == CodeModel::Small && Offset < 16*1024*1024)
4363 // For kernel code model we know that all object resist in the negative half
4364 // of 32bits address space. We may not accept negative offsets, since they may
4365 // be just off and we may accept pretty large positive ones.
4366 if (M == CodeModel::Kernel && Offset >= 0)
4372 /// Determines whether the callee is required to pop its own arguments.
4373 /// Callee pop is necessary to support tail calls.
4374 bool X86::isCalleePop(CallingConv::ID CallingConv,
4375 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4376 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4377 // can guarantee TCO.
4378 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4381 switch (CallingConv) {
4384 case CallingConv::X86_StdCall:
4385 case CallingConv::X86_FastCall:
4386 case CallingConv::X86_ThisCall:
4387 case CallingConv::X86_VectorCall:
4392 /// \brief Return true if the condition is an unsigned comparison operation.
4393 static bool isX86CCUnsigned(unsigned X86CC) {
4396 llvm_unreachable("Invalid integer condition!");
4412 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4413 switch (SetCCOpcode) {
4414 default: llvm_unreachable("Invalid integer condition!");
4415 case ISD::SETEQ: return X86::COND_E;
4416 case ISD::SETGT: return X86::COND_G;
4417 case ISD::SETGE: return X86::COND_GE;
4418 case ISD::SETLT: return X86::COND_L;
4419 case ISD::SETLE: return X86::COND_LE;
4420 case ISD::SETNE: return X86::COND_NE;
4421 case ISD::SETULT: return X86::COND_B;
4422 case ISD::SETUGT: return X86::COND_A;
4423 case ISD::SETULE: return X86::COND_BE;
4424 case ISD::SETUGE: return X86::COND_AE;
// NOTE(review): several original lines are elided in this extract (the
// return after the X<0 check, the 1-bit setcc special case body, parts of
// the operand-flip switch, the flipped-FP fallthrough cases, and closing
// braces). Only comments were added; the visible code is unchanged.
4428 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4429 /// condition code, returning the condition code and the LHS/RHS of the
4430 /// comparison to make.
4431 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4432 bool isFP, SDValue &LHS, SDValue &RHS,
4433 SelectionDAG &DAG) {
// Integer path: first try a few profitable RHS-constant rewrites, then fall
// back to the straight table translation.
4435 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4436 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4437 // X > -1 -> X == 0, jump !sign.
4438 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4439 return X86::COND_NS;
4441 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4442 // X < 0 -> X == 0, jump on sign.
4445 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4447 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4448 return X86::COND_LE;
4452 return TranslateIntegerX86CC(SetCCOpcode);
4455 // First determine if it is required or is profitable to flip the operands.
4457 // If LHS is a foldable load, but RHS is not, flip the condition.
4458 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4459 !ISD::isNON_EXTLoad(RHS.getNode())) {
4460 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4461 std::swap(LHS, RHS);
4464 switch (SetCCOpcode) {
4470 std::swap(LHS, RHS);
4474 // On a floating point condition, the flags are set as follows:
4476 // 0 | 0 | 0 | X > Y
4477 // 0 | 0 | 1 | X < Y
4478 // 1 | 0 | 0 | X == Y
4479 // 1 | 1 | 1 | unordered
4480 switch (SetCCOpcode) {
4481 default: llvm_unreachable("Condcode should be pre-legalized away");
4483 case ISD::SETEQ: return X86::COND_E;
4484 case ISD::SETOLT: // flipped
4486 case ISD::SETGT: return X86::COND_A;
4487 case ISD::SETOLE: // flipped
4489 case ISD::SETGE: return X86::COND_AE;
4490 case ISD::SETUGT: // flipped
4492 case ISD::SETLT: return X86::COND_B;
4493 case ISD::SETUGE: // flipped
4495 case ISD::SETLE: return X86::COND_BE;
4497 case ISD::SETNE: return X86::COND_NE;
4498 case ISD::SETUO: return X86::COND_P;
4499 case ISD::SETO: return X86::COND_NP;
4501 case ISD::SETUNE: return X86::COND_INVALID;
4505 /// Is there a floating point cmov for the specific X86 condition code?
4506 /// Current x86 isa includes the following FP cmov instructions:
4507 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4508 static bool hasFPCMov(unsigned X86CC) {
// NOTE(review): this extract is missing lines from the original (a
// parameter line, the null-IntrData early return, `break`s/closing braces
// of the cases, and the trailing `return true;`). Only comments were added.
/// Describe the memory access performed by a memory-touching X86 intrinsic
/// so the SelectionDAG builder can attach a MachineMemOperand to it.
4525 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4527 MachineFunction &MF,
4528 unsigned Intrinsic) const {
// Only intrinsics registered in X86IntrinsicsInfo as chain-carrying are
// handled here.
4530 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4534 Info.opc = ISD::INTRINSIC_W_CHAIN;
4535 Info.flags = MachineMemOperand::MONone;
4538 switch (IntrData->Type) {
4539 case EXPAND_FROM_MEM: {
4540 Info.ptrVal = I.getArgOperand(0);
4541 Info.memVT = MVT::getVT(I.getType());
4543 Info.flags |= MachineMemOperand::MOLoad;
4546 case COMPRESS_TO_MEM: {
4547 Info.ptrVal = I.getArgOperand(0);
4548 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4550 Info.flags |= MachineMemOperand::MOStore;
4553 case TRUNCATE_TO_MEM_VI8:
4554 case TRUNCATE_TO_MEM_VI16:
4555 case TRUNCATE_TO_MEM_VI32: {
4556 Info.ptrVal = I.getArgOperand(0);
4557 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4558 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4559 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4561 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4562 ScalarVT = MVT::i16;
4563 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4564 ScalarVT = MVT::i32;
// The truncating store writes a vector of the narrowed scalar type with the
// same element count as the source.
4566 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4568 Info.flags |= MachineMemOperand::MOStore;
4578 /// Returns true if the target can instruction select the
4579 /// specified FP immediate natively. If false, the legalizer will
4580 /// materialize the FP immediate as a load from a constant pool.
4581 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4582 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4583 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4589 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4590 ISD::LoadExtType ExtTy,
4592 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4593 // relocation target a movq or addq instruction: don't let the load shrink.
4594 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4595 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4596 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4597 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4601 /// \brief Returns true if it is beneficial to convert a load of a constant
4602 /// to just the constant itself.
4603 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4605 assert(Ty->isIntegerTy());
4607 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4608 if (BitSize == 0 || BitSize > 64)
4613 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4614 // TODO: It might be a win to ease or lift this restriction, but the generic
4615 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4616 if (VT.isVector() && Subtarget.hasAVX512())
4622 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4623 unsigned Index) const {
4624 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4627 // Mask vectors support all subregister combinations and operations that
4628 // extract half of vector.
4629 if (ResVT.getVectorElementType() == MVT::i1)
4630 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4631 (Index == ResVT.getVectorNumElements()));
4633 return (Index % ResVT.getVectorNumElements()) == 0;
4636 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4637 // Speculate cttz only if we can directly use TZCNT.
4638 return Subtarget.hasBMI();
4641 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4642 // Speculate ctlz only if we can directly use LZCNT.
4643 return Subtarget.hasLZCNT();
4646 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4647 EVT BitcastVT) const {
4648 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
4651 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4654 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4655 const SelectionDAG &DAG) const {
4656 // Do not merge to float value size (128 bytes) if no implicit
4657 // float attribute is set.
4658 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4659 Attribute::NoImplicitFloat);
4662 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4663 return (MemVT.getSizeInBits() <= MaxIntSize);
4668 bool X86TargetLowering::isCtlzFast() const {
4669 return Subtarget.hasFastLZCNT();
4672 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4673 const Instruction &AndI) const {
4677 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4678 if (!Subtarget.hasBMI())
4681 // There are only 32-bit and 64-bit forms for 'andn'.
4682 EVT VT = Y.getValueType();
4683 if (VT != MVT::i32 && VT != MVT::i64)
4689 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4690 MVT VT = MVT::getIntegerVT(NumBits);
4691 if (isTypeLegal(VT))
4694 // PMOVMSKB can handle this.
4695 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4698 // VPMOVMSKB can handle this.
4699 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4702 // TODO: Allow 64-bit type for 32-bit target.
4703 // TODO: 512-bit types should be allowed, but make sure that those
4704 // cases are handled in combineVectorSizedSetCCEquality().
4706 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4709 /// Val is the undef sentinel value or equal to the specified value.
4710 static bool isUndefOrEqual(int Val, int CmpVal) {
4711 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4714 /// Val is either the undef or zero sentinel value.
4715 static bool isUndefOrZero(int Val) {
4716 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4719 /// Return true if every element in Mask, beginning
4720 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4721 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4722 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4723 if (Mask[i] != SM_SentinelUndef)
4728 /// Return true if Val is undef or if its value falls within the
4729 /// specified range (L, H].
4730 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4731 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4734 /// Return true if every element in Mask is undef or if its value
4735 /// falls within the specified range (L, H].
4736 static bool isUndefOrInRange(ArrayRef<int> Mask,
4739 if (!isUndefOrInRange(M, Low, Hi))
4744 /// Return true if Val is undef, zero or if its value falls within the
4745 /// specified range (L, H].
4746 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4747 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4750 /// Return true if every element in Mask is undef, zero or if its value
4751 /// falls within the specified range (L, H].
4752 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4754 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4759 /// Return true if every element in Mask, beginning
4760 /// from position Pos and ending in Pos+Size, falls within the specified
4761 /// sequential range (Low, Low+Size]. or is undef.
4762 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4763 unsigned Pos, unsigned Size, int Low) {
4764 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4765 if (!isUndefOrEqual(Mask[i], Low))
4770 /// Return true if every element in Mask, beginning
4771 /// from position Pos and ending in Pos+Size, falls within the specified
4772 /// sequential range (Low, Low+Size], or is undef or is zero.
4773 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4774 unsigned Size, int Low) {
4775 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4776 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4781 /// Return true if every element in Mask, beginning
4782 /// from position Pos and ending in Pos+Size is undef or is zero.
4783 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4785 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4786 if (!isUndefOrZero(Mask[i]))
4791 /// \brief Helper function to test whether a shuffle mask could be
4792 /// simplified by widening the elements being shuffled.
4794 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4795 /// leaves it in an unspecified state.
4797 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4798 /// shuffle masks. The latter have the special property of a '-2' representing
4799 /// a zero-ed lane of a vector.
4800 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4801 SmallVectorImpl<int> &WidenedMask) {
4802 WidenedMask.assign(Mask.size() / 2, 0);
4803 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4805 int M1 = Mask[i + 1];
4807 // If both elements are undef, its trivial.
4808 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4809 WidenedMask[i / 2] = SM_SentinelUndef;
4813 // Check for an undef mask and a mask value properly aligned to fit with
4814 // a pair of values. If we find such a case, use the non-undef mask's value.
4815 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4816 WidenedMask[i / 2] = M1 / 2;
4819 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4820 WidenedMask[i / 2] = M0 / 2;
4824 // When zeroing, we need to spread the zeroing across both lanes to widen.
4825 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4826 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4827 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4828 WidenedMask[i / 2] = SM_SentinelZero;
4834 // Finally check if the two mask values are adjacent and aligned with
4836 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4837 WidenedMask[i / 2] = M0 / 2;
4841 // Otherwise we can't safely widen the elements used in this shuffle.
4844 assert(WidenedMask.size() == Mask.size() / 2 &&
4845 "Incorrect size of mask after widening the elements!");
4850 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4851 bool X86::isZeroNode(SDValue Elt) {
4852 return isNullConstant(Elt) || isNullFPConstant(Elt);
// NOTE(review): this extract is missing lines from the original (the
// `bool Split = false;` setup, `Split = true;` inside the i64->2xi32
// branch, the `if (Split)` guard before the extra push_back, and the
// closing braces/return). Only comments were added here.
4855 // Build a vector of constants.
4856 // Use an UNDEF node if MaskElt == -1.
4857 // Split 64-bit constants in the 32-bit mode.
4858 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4859 const SDLoc &dl, bool IsMask = false) {
4861 SmallVector<SDValue, 32> Ops;
4864 MVT ConstVecVT = VT;
4865 unsigned NumElts = VT.getVectorNumElements();
4866 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
// Without native i64, build the vector as twice as many i32 elements and
// bitcast back to VT at the end.
4867 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4868 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4872 MVT EltVT = ConstVecVT.getVectorElementType();
4873 for (unsigned i = 0; i < NumElts; ++i) {
// Negative values mean undef only when building a shuffle/select mask.
4874 bool IsUndef = Values[i] < 0 && IsMask;
4875 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4876 DAG.getConstant(Values[i], dl, EltVT);
4877 Ops.push_back(OpNode);
// Second (high) half of a split 64-bit element.
4879 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4880 DAG.getConstant(0, dl, EltVT));
4882 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4884 ConstsNode = DAG.getBitcast(VT, ConstsNode);
/// Build a constant vector from explicit APInt bit patterns; elements whose
/// bit is set in Undefs become UNDEF nodes.
// NOTE(review): this extract is missing lines from the original (the
// `bool Split` setup, the `if (Undefs[i]) { ... continue; }` guard, the
// `if (Split) {` / `} else {` lines around the 64-bit split, and closing
// braces). Only comments were added here.
4888 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4889 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4890 assert(Bits.size() == Undefs.getBitWidth() &&
4891 "Unequal constant and undef arrays");
4892 SmallVector<SDValue, 32> Ops;
4895 MVT ConstVecVT = VT;
4896 unsigned NumElts = VT.getVectorNumElements();
4897 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4898 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4899 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4903 MVT EltVT = ConstVecVT.getVectorElementType();
4904 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
// Undef elements take one UNDEF (or two when each i64 is split into i32s).
4906 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4909 const APInt &V = Bits[i];
4910 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
// Split path: low 32 bits first, then high 32 bits.
4912 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4913 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4914 } else if (EltVT == MVT::f32) {
4915 APFloat FV(APFloat::IEEEsingle(), V);
4916 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4917 } else if (EltVT == MVT::f64) {
4918 APFloat FV(APFloat::IEEEdouble(), V);
4919 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4921 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4925 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4926 return DAG.getBitcast(VT, ConstsNode);
4929 /// Returns a vector of specified type with all zero elements.
4930 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4931 SelectionDAG &DAG, const SDLoc &dl) {
4932 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4933 VT.getVectorElementType() == MVT::i1) &&
4934 "Unexpected vector type");
4936 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4937 // type. This ensures they get CSE'd. But if the integer type is not
4938 // available, use a floating-point +0.0 instead.
4940 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4941 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4942 } else if (VT.getVectorElementType() == MVT::i1) {
4943 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4944 "Unexpected vector type");
4945 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4946 "Unexpected vector type");
4947 Vec = DAG.getConstant(0, dl, VT);
4949 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4950 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4952 return DAG.getBitcast(VT, Vec);
4955 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4956 const SDLoc &dl, unsigned vectorWidth) {
4957 EVT VT = Vec.getValueType();
4958 EVT ElVT = VT.getVectorElementType();
4959 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4960 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4961 VT.getVectorNumElements()/Factor);
4963 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4964 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4965 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4967 // This is the index of the first element of the vectorWidth-bit chunk
4968 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4969 IdxVal &= ~(ElemsPerChunk - 1);
4971 // If the input is a buildvector just emit a smaller one.
4972 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4973 return DAG.getBuildVector(ResultVT, dl,
4974 Vec->ops().slice(IdxVal, ElemsPerChunk));
4976 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4977 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4980 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4981 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4982 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4983 /// instructions or a simple subregister reference. Idx is an index in the
4984 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4985 /// lowering EXTRACT_VECTOR_ELT operations easier.
4986 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4987 SelectionDAG &DAG, const SDLoc &dl) {
4988 assert((Vec.getValueType().is256BitVector() ||
4989 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4990 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4993 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4994 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4995 SelectionDAG &DAG, const SDLoc &dl) {
4996 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4997 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5000 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5001 SelectionDAG &DAG, const SDLoc &dl,
5002 unsigned vectorWidth) {
5003 assert((vectorWidth == 128 || vectorWidth == 256) &&
5004 "Unsupported vector width");
5005 // Inserting UNDEF is Result
5008 EVT VT = Vec.getValueType();
5009 EVT ElVT = VT.getVectorElementType();
5010 EVT ResultVT = Result.getValueType();
5012 // Insert the relevant vectorWidth bits.
5013 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5014 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5016 // This is the index of the first element of the vectorWidth-bit chunk
5017 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5018 IdxVal &= ~(ElemsPerChunk - 1);
5020 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5021 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5024 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5025 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5026 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5027 /// simple superregister reference. Idx is an index in the 128 bits
5028 /// we want. It need not be aligned to a 128-bit boundary. That makes
5029 /// lowering INSERT_VECTOR_ELT operations easier.
5030 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5031 SelectionDAG &DAG, const SDLoc &dl) {
5032 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5033 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5036 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5037 SelectionDAG &DAG, const SDLoc &dl) {
5038 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5039 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
// NOTE(review): the `switch (Opcode) {`, `default:` branch, several case
// labels, the `return true;` and the closing braces were elided in this
// extract; only comments were added here.
5042 // Return true if the instruction zeroes the unused upper part of the
5043 // destination and accepts mask.
5044 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5049 case X86ISD::TESTNM:
5050 case X86ISD::PCMPEQM:
5051 case X86ISD::PCMPGTM:
5054 case X86ISD::CMPM_RND:
5059 /// Insert i1-subvector to i1-vector.
// Lower INSERT_SUBVECTOR of an i1 subvector into an i1 (mask) vector.
// Operands: (0) the destination mask vector, (1) the i1 subvector, (2) the
// constant insertion index. The strategy is to widen to a natively
// kshift-able mask type, isolate/clear the target bit range with
// KSHIFTL/KSHIFTR, merge via OR/XOR, then narrow back to the original type.
5060 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5061 const X86Subtarget &Subtarget) {
5064 SDValue Vec = Op.getOperand(0);
5065 SDValue SubVec = Op.getOperand(1);
5066 SDValue Idx = Op.getOperand(2);
// Only constant insertion indices are handled here.
5068 if (!isa<ConstantSDNode>(Idx))
5071 // Inserting undef is a nop. We can just return the original vector.
5072 if (SubVec.isUndef())
5075 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5076 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5079 MVT OpVT = Op.getSimpleValueType();
5080 unsigned NumElems = OpVT.getVectorNumElements();
5082 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5084 // Extend to natively supported kshift.
// v8i1 kshifts need DQI; without DQI (or for < 8 elements) widen to v16i1.
5085 MVT WideOpVT = OpVT;
5086 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5087 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5089 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5091 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5092 // May need to promote to a legal type.
5093 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5094 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5096 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5099 MVT SubVecVT = SubVec.getSimpleValueType();
5100 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5102 assert(IdxVal + SubVecNumElems <= NumElems &&
5103 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5104 "Unexpected index value in INSERT_SUBVECTOR");
5106 SDValue Undef = DAG.getUNDEF(WideOpVT);
5109 // Zero lower bits of the Vec
// Shift right then left by SubVecNumElems to clear the low bits of Vec.
5110 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5111 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5113 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5114 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5115 // Merge them together, SubVec should be zero extended.
5116 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5117 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5119 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5120 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
// Widen SubVec into the kshift-able type before positioning it.
5123 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5124 Undef, SubVec, ZeroIdx);
5126 if (Vec.isUndef()) {
5127 assert(IdxVal != 0 && "Unexpected index");
// Destination is undef: just shift the subvector into place.
5128 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5129 DAG.getConstant(IdxVal, dl, MVT::i8));
5130 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5133 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5134 assert(IdxVal != 0 && "Unexpected index");
// Destination is all-zero: shift subvector to MSB (clearing low bits),
// then shift right into final position (clearing high bits).
5135 NumElems = WideOpVT.getVectorNumElements();
5136 unsigned ShiftLeft = NumElems - SubVecNumElems;
5137 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5138 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5139 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5140 if (ShiftRight != 0)
5141 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5142 DAG.getConstant(ShiftRight, dl, MVT::i8));
5143 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5146 // Simple case when we put subvector in the upper part
5147 if (IdxVal + SubVecNumElems == NumElems) {
5148 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5149 DAG.getConstant(IdxVal, dl, MVT::i8));
5150 if (SubVecNumElems * 2 == NumElems) {
5151 // Special case, use legal zero extending insert_subvector. This allows
5152 // isel to optimize when bits are known zero.
5153 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5154 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5155 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5158 // Otherwise use explicit shifts to zero the bits.
5159 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5160 Undef, Vec, ZeroIdx);
5161 NumElems = WideOpVT.getVectorNumElements();
5162 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5163 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5164 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5166 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5167 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5170 // Inserting into the middle is more complicated.
5172 NumElems = WideOpVT.getVectorNumElements();
5174 // Widen the vector if needed.
5175 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5176 // Move the current value of the bit to be replaced to the lsbs.
5177 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5178 DAG.getConstant(IdxVal, dl, MVT::i8));
5179 // Xor with the new bit.
5180 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5181 // Shift to MSB, filling bottom bits with 0.
5182 unsigned ShiftLeft = NumElems - SubVecNumElems;
5183 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5184 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5185 // Shift to the final position, filling upper bits with 0.
5186 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5187 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5188 DAG.getConstant(ShiftRight, dl, MVT::i8));
5189 // Xor with original vector leaving the new value.
5190 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5191 // Reduce to original width if needed.
5192 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5195 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5196 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5197 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5198 /// large BUILD_VECTORS.
5199 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5200 unsigned NumElems, SelectionDAG &DAG,
// Build the result as two 128-bit inserts into an undef vector: V1 into the
// low half (index 0), V2 into the upper half (index NumElems / 2).
5202 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5203 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
// Concat two 256-bit vectors into a wider vector; mirrors
// concat128BitVectors but using 256-bit inserts.
5206 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5207 unsigned NumElems, SelectionDAG &DAG,
// V1 goes into the low half (index 0), V2 into the upper half.
5209 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5210 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5213 /// Returns a vector of specified type with all bits set.
5214 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5215 /// Then bitcast to their original type, ensuring they get CSE'd.
5216 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5217 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5218 "Expected a 128/256/512-bit vector type");
// Materialize the all-ones value as a vector of 32-bit all-ones elements so
// equivalent constants CSE regardless of the requested element type, then
// bitcast to VT.
5220 APInt Ones = APInt::getAllOnesValue(32);
5221 unsigned NumElts = VT.getSizeInBits() / 32;
5222 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5223 return DAG.getBitcast(VT, Vec);
// Create an extend-in-vector node (X86ISD::VSEXT or X86ISD::VZEXT) of In to
// VT, trimming the input to the minimum number of source bits first.
5226 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5227 SelectionDAG &DAG) {
5228 EVT InVT = In.getValueType();
5229 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
// 128-bit -> 128-bit extensions map directly onto the generic
// sign/zero-extend-vector-in-reg nodes.
5231 if (VT.is128BitVector() && InVT.is128BitVector())
5232 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5233 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5235 // For 256-bit vectors, we only need the lower (128-bit) input half.
5236 // For 512-bit vectors, we only need the lower input half or quarter.
5237 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5238 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
// Extract the low chunk (never narrower than 128 bits) holding the
// elements that the extension actually consumes.
5239 In = extractSubVector(In, 0, DAG, DL,
5240 std::max(128, (int)VT.getSizeInBits() / Scale));
5243 return DAG.getNode(Opc, DL, VT, In);
5246 /// Returns a vector_shuffle node for an unpackl operation.
5247 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5248 SDValue V1, SDValue V2) {
// Build the low-half interleave mask, then emit it as a generic shuffle.
5249 SmallVector<int, 8> Mask;
5250 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5251 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5254 /// Returns a vector_shuffle node for an unpackh operation.
5255 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5256 SDValue V1, SDValue V2) {
// Build the high-half interleave mask, then emit it as a generic shuffle.
5257 SmallVector<int, 8> Mask;
5258 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5259 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5262 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5263 /// This produces a shuffle where the low element of V2 is swizzled into the
5264 /// zero/undef vector, landing at element Idx.
5265 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5266 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5268 const X86Subtarget &Subtarget,
5269 SelectionDAG &DAG) {
5270 MVT VT = V2.getSimpleValueType();
// V1 is the zero (or undef) background vector that V2's low element lands in.
5272 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5273 int NumElems = VT.getVectorNumElements();
5274 SmallVector<int, 16> MaskVec(NumElems);
5275 for (int i = 0; i != NumElems; ++i)
5276 // If this is the insertion idx, put the low elt of V2 here.
// Mask index NumElems selects element 0 of the second shuffle operand (V2).
5277 MaskVec[i] = (i == Idx) ? NumElems : i;
5278 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
// Walk down through any chain of BITCAST nodes and return the first
// non-bitcast value.
5281 static SDValue peekThroughBitcasts(SDValue V) {
5282 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5283 V = V.getOperand(0);
// Like peekThroughBitcasts, but stop at any bitcast whose source has more
// than one use (so callers don't look through shared values).
5287 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5288 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5289 V.getOperand(0).hasOneUse())
5290 V = V.getOperand(0);
// If Op is (possibly bitcast) load from the constant pool, return the IR
// Constant it loads; otherwise the lookup fails.
5294 static const Constant *getTargetConstantFromNode(SDValue Op) {
5295 Op = peekThroughBitcasts(Op);
5297 auto *Load = dyn_cast<LoadSDNode>(Op);
// The constant-pool address is usually wrapped in X86ISD::Wrapper(RIP).
5301 SDValue Ptr = Load->getBasePtr();
5302 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5303 Ptr->getOpcode() == X86ISD::WrapperRIP)
5304 Ptr = Ptr->getOperand(0);
// Machine constant-pool entries have no IR Constant to return.
5306 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5307 if (!CNode || CNode->isMachineConstantPoolEntry())
5310 return dyn_cast<Constant>(CNode->getConstVal());
5313 // Extract raw constant bits from constant pools.
// Extract raw constant bits from Op (constant pools, build vectors, splats,
// broadcasts, rematerialized scalars), re-packed as NumElts elements of
// EltSizeInBits bits each. On success, UndefElts flags fully-undef result
// elements and EltBits holds the per-element constant values.
// AllowWholeUndefs / AllowPartialUndefs control whether fully or partially
// undef source elements are tolerated (partial undef bits are treated as 0).
5314 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5316 SmallVectorImpl<APInt> &EltBits,
5317 bool AllowWholeUndefs = true,
5318 bool AllowPartialUndefs = true) {
5319 assert(EltBits.empty() && "Expected an empty EltBits vector");
5321 Op = peekThroughBitcasts(Op);
5323 EVT VT = Op.getValueType();
5324 unsigned SizeInBits = VT.getSizeInBits();
5325 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5326 unsigned NumElts = SizeInBits / EltSizeInBits;
5328 // Bitcast a source array of element bits to the target size.
5329 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5330 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5331 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5332 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5333 "Constant bit sizes don't match");
5335 // Don't split if we don't allow undef bits.
5336 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5337 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5340 // If we're already the right size, don't bother bitcasting.
5341 if (NumSrcElts == NumElts) {
5342 UndefElts = UndefSrcElts;
5343 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5347 // Extract all the undef/constant element data and pack into single bitsets.
5348 APInt UndefBits(SizeInBits, 0);
5349 APInt MaskBits(SizeInBits, 0);
5351 for (unsigned i = 0; i != NumSrcElts; ++i) {
5352 unsigned BitOffset = i * SrcEltSizeInBits;
5353 if (UndefSrcElts[i])
5354 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5355 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5358 // Split the undef/constant single bitset data into the target elements.
5359 UndefElts = APInt(NumElts, 0);
5360 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5362 for (unsigned i = 0; i != NumElts; ++i) {
5363 unsigned BitOffset = i * EltSizeInBits;
5364 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5366 // Only treat an element as UNDEF if all bits are UNDEF.
5367 if (UndefEltBits.isAllOnesValue()) {
5368 if (!AllowWholeUndefs)
5370 UndefElts.setBit(i);
5374 // If only some bits are UNDEF then treat them as zero (or bail if not
5376 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5379 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5380 EltBits[i] = Bits.getZExtValue();
5385 // Collect constant bits and insert into mask/undef bit masks.
5386 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5387 unsigned UndefBitIndex) {
5390 if (isa<UndefValue>(Cst)) {
5391 Undefs.setBit(UndefBitIndex);
5394 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5395 Mask = CInt->getValue();
// Floating-point constants contribute their raw bit pattern.
5398 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5399 Mask = CFP->getValueAPF().bitcastToAPInt();
// Fully-undef source: all elements flagged undef, bits zero.
5407 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5408 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5409 return CastBitData(UndefSrcElts, SrcEltBits);
5412 // Extract scalar constant bits.
5413 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5414 APInt UndefSrcElts = APInt::getNullValue(1);
5415 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5416 return CastBitData(UndefSrcElts, SrcEltBits);
5419 // Extract constant bits from build vector.
5420 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5421 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5422 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5424 APInt UndefSrcElts(NumSrcElts, 0);
5425 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5426 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5427 const SDValue &Src = Op.getOperand(i);
5428 if (Src.isUndef()) {
5429 UndefSrcElts.setBit(i);
5432 auto *Cst = cast<ConstantSDNode>(Src);
5433 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5435 return CastBitData(UndefSrcElts, SrcEltBits);
5438 // Extract constant bits from constant pool vector.
5439 if (auto *Cst = getTargetConstantFromNode(Op)) {
5440 Type *CstTy = Cst->getType();
// Only whole-width vector constants are decodable here.
5441 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5444 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5445 unsigned NumSrcElts = CstTy->getVectorNumElements();
5447 APInt UndefSrcElts(NumSrcElts, 0);
5448 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5449 for (unsigned i = 0; i != NumSrcElts; ++i)
5450 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5454 return CastBitData(UndefSrcElts, SrcEltBits);
5457 // Extract constant bits from a broadcasted constant pool scalar.
5458 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5459 EltSizeInBits <= VT.getScalarSizeInBits()) {
5460 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5461 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5462 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5464 APInt UndefSrcElts(NumSrcElts, 0);
5465 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5466 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
// An undef broadcast scalar makes every splatted element undef.
5467 if (UndefSrcElts[0])
5468 UndefSrcElts.setBits(0, NumSrcElts);
5469 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5470 return CastBitData(UndefSrcElts, SrcEltBits);
5475 // Extract a rematerialized scalar constant insertion.
5476 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5477 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5478 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5479 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5480 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
// Element 0 holds the constant; VZEXT_MOVL zeroes the remaining elements.
5482 APInt UndefSrcElts(NumSrcElts, 0);
5483 SmallVector<APInt, 64> SrcEltBits;
5484 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5485 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5486 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5487 return CastBitData(UndefSrcElts, SrcEltBits);
// Decode a constant shuffle-mask operand into raw integer indices of
// MaskEltSizeInBits-bit granularity. Fails if any mask bits are undef.
5493 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5494 unsigned MaskEltSizeInBits,
5495 SmallVectorImpl<uint64_t> &RawMask) {
5497 SmallVector<APInt, 64> EltBits;
5499 // Extract the raw target constant bits.
5500 // FIXME: We currently don't support UNDEF bits or mask entries.
5501 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5502 EltBits, /* AllowWholeUndefs */ false,
5503 /* AllowPartialUndefs */ false))
5506 // Insert the extracted elements into the mask.
5507 for (APInt Elt : EltBits)
5508 RawMask.push_back(Elt.getZExtValue());
5513 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5514 /// Note: This ignores saturation, so inputs must be checked first.
5515 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5517 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5518 unsigned NumElts = VT.getVectorNumElements();
5519 unsigned NumLanes = VT.getSizeInBits() / 128;
5520 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
// For a binary pack the second half of each lane comes from operand 1
// (indices offset by NumElts); a unary pack reuses operand 0.
5521 unsigned Offset = Unary ? 0 : NumElts;
// PACK takes the even-indexed (truncated) elements from each input,
// per 128-bit lane: first from op0's lane, then from op1's lane.
5523 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5524 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5525 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5526 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5527 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5531 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5532 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5533 /// operands in \p Ops, and returns true.
5534 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5535 /// IsUnary for shuffles which use a single input multiple times, and in those
5536 /// cases it will adjust the mask to only have indices within that single input.
5537 /// It is an error to call this with non-empty Mask/Ops vectors.
// Decode the shuffle mask of a target-specific shuffle node N of type VT.
// On success: Mask holds the decoded indices (possibly with sentinels), Ops
// holds the shuffle inputs, and IsUnary reports single-input shuffles
// (including "fake unary" ones whose two operands are the same node, whose
// masks are re-mapped below to reference only the first input).
5538 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5539 SmallVectorImpl<SDValue> &Ops,
5540 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5541 unsigned NumElems = VT.getVectorNumElements();
5544 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5545 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5548 bool IsFakeUnary = false;
// Dispatch on the shuffle opcode; each case calls the matching decoder from
// X86ShuffleDecode / X86ShuffleDecodeConstantPool.
5549 switch(N->getOpcode()) {
5550 case X86ISD::BLENDI:
5551 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5552 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5553 ImmN = N->getOperand(N->getNumOperands()-1);
5554 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5555 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5558 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5559 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5560 ImmN = N->getOperand(N->getNumOperands()-1);
5561 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5562 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5564 case X86ISD::INSERTPS:
5565 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5566 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5567 ImmN = N->getOperand(N->getNumOperands()-1);
5568 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5569 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5571 case X86ISD::EXTRQI:
5572 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
// Only decodable when bit-length and bit-index operands are constants.
5573 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5574 isa<ConstantSDNode>(N->getOperand(2))) {
5575 int BitLen = N->getConstantOperandVal(1);
5576 int BitIdx = N->getConstantOperandVal(2);
5577 DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5581 case X86ISD::INSERTQI:
5582 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5583 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5584 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5585 isa<ConstantSDNode>(N->getOperand(3))) {
5586 int BitLen = N->getConstantOperandVal(2);
5587 int BitIdx = N->getConstantOperandVal(3);
5588 DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5589 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5592 case X86ISD::UNPCKH:
5593 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5594 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5595 DecodeUNPCKHMask(VT, Mask);
5596 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5598 case X86ISD::UNPCKL:
5599 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5600 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5601 DecodeUNPCKLMask(VT, Mask);
5602 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5604 case X86ISD::MOVHLPS:
5605 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5606 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5607 DecodeMOVHLPSMask(NumElems, Mask);
5608 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5610 case X86ISD::MOVLHPS:
5611 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeMOVLHPSMask(NumElems, Mask);
5614 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5616 case X86ISD::PALIGNR:
5617 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5618 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5619 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5620 ImmN = N->getOperand(N->getNumOperands()-1);
5621 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5622 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
// PALIGNR's decoded mask references the operands in swapped order, so push
// operand 1 before operand 0.
5623 Ops.push_back(N->getOperand(1));
5624 Ops.push_back(N->getOperand(0));
5626 case X86ISD::VSHLDQ:
5627 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5628 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5629 ImmN = N->getOperand(N->getNumOperands() - 1);
5630 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5633 case X86ISD::VSRLDQ:
5634 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5635 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5636 ImmN = N->getOperand(N->getNumOperands() - 1);
5637 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5640 case X86ISD::PSHUFD:
5641 case X86ISD::VPERMILPI:
5642 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5643 ImmN = N->getOperand(N->getNumOperands()-1);
5644 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5647 case X86ISD::PSHUFHW:
5648 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5649 ImmN = N->getOperand(N->getNumOperands()-1);
5650 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5653 case X86ISD::PSHUFLW:
5654 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5655 ImmN = N->getOperand(N->getNumOperands()-1);
5656 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5659 case X86ISD::VZEXT_MOVL:
5660 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5661 DecodeZeroMoveLowMask(VT, Mask);
5664 case X86ISD::VBROADCAST: {
5665 SDValue N0 = N->getOperand(0);
5666 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5667 // add the pre-extracted value to the Ops vector.
5668 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5669 N0.getOperand(0).getValueType() == VT &&
5670 N0.getConstantOperandVal(1) == 0)
5671 Ops.push_back(N0.getOperand(0));
5673 // We only decode broadcasts of same-sized vectors, unless the broadcast
5674 // came from an extract from the original width. If we found one, we
5675 // pushed it the Ops vector above.
5676 if (N0.getValueType() == VT || !Ops.empty()) {
5677 DecodeVectorBroadcast(VT, Mask);
5683 case X86ISD::VPERMILPV: {
5684 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
// Variable mask: try raw constant indices first, then a constant-pool load.
5686 SDValue MaskNode = N->getOperand(1);
5687 unsigned MaskEltSize = VT.getScalarSizeInBits();
5688 SmallVector<uint64_t, 32> RawMask;
5689 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5690 DecodeVPERMILPMask(VT, RawMask, Mask);
5693 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5694 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5699 case X86ISD::PSHUFB: {
5700 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5701 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5702 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5704 SDValue MaskNode = N->getOperand(1);
5705 SmallVector<uint64_t, 32> RawMask;
5706 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5707 DecodePSHUFBMask(RawMask, Mask);
5710 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5711 DecodePSHUFBMask(C, Mask);
5716 case X86ISD::VPERMI:
5717 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5718 ImmN = N->getOperand(N->getNumOperands()-1);
5719 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5724 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5725 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5726 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5728 case X86ISD::VPERM2X128:
5729 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5730 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5731 ImmN = N->getOperand(N->getNumOperands()-1);
5732 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5733 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5735 case X86ISD::MOVSLDUP:
5736 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5737 DecodeMOVSLDUPMask(VT, Mask);
5740 case X86ISD::MOVSHDUP:
5741 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5742 DecodeMOVSHDUPMask(VT, Mask);
5745 case X86ISD::MOVDDUP:
5746 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5747 DecodeMOVDDUPMask(VT, Mask);
5750 case X86ISD::MOVLPD:
5751 case X86ISD::MOVLPS:
5752 // Not yet implemented
5754 case X86ISD::VPERMIL2: {
5755 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5756 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5757 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5758 unsigned MaskEltSize = VT.getScalarSizeInBits();
5759 SDValue MaskNode = N->getOperand(2);
5760 SDValue CtrlNode = N->getOperand(3);
5761 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5762 unsigned CtrlImm = CtrlOp->getZExtValue();
5763 SmallVector<uint64_t, 32> RawMask;
5764 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5765 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5768 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5769 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5775 case X86ISD::VPPERM: {
5776 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5777 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5778 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5779 SDValue MaskNode = N->getOperand(2);
5780 SmallVector<uint64_t, 32> RawMask;
5781 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5782 DecodeVPPERMMask(RawMask, Mask);
5785 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5786 DecodeVPPERMMask(C, Mask);
5791 case X86ISD::VPERMV: {
5792 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5794 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5795 Ops.push_back(N->getOperand(1));
5796 SDValue MaskNode = N->getOperand(0);
5797 SmallVector<uint64_t, 32> RawMask;
5798 unsigned MaskEltSize = VT.getScalarSizeInBits();
5799 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5800 DecodeVPERMVMask(RawMask, Mask);
5803 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5804 DecodeVPERMVMask(C, MaskEltSize, Mask);
5809 case X86ISD::VPERMV3: {
5810 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5811 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5812 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5813 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5814 Ops.push_back(N->getOperand(0));
5815 Ops.push_back(N->getOperand(2));
5816 SDValue MaskNode = N->getOperand(1);
5817 unsigned MaskEltSize = VT.getScalarSizeInBits();
5818 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5819 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5824 case X86ISD::VPERMIV3: {
5825 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5826 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5827 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5828 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5829 Ops.push_back(N->getOperand(1));
5830 Ops.push_back(N->getOperand(2));
5831 SDValue MaskNode = N->getOperand(0);
5832 unsigned MaskEltSize = VT.getScalarSizeInBits();
5833 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5834 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5839 default: llvm_unreachable("unknown target shuffle node");
5842 // Empty mask indicates the decode failed.
5846 // Check if we're getting a shuffle mask with zero'd elements.
5847 if (!AllowSentinelZero)
5848 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5851 // If we have a fake unary shuffle, the shuffle mask is spread across two
5852 // inputs that are actually the same node. Re-map the mask to always point
5853 // into the first input.
5856 if (M >= (int)Mask.size())
5859 // If we didn't already add operands in the opcode-specific code, default to
5860 // adding 1 or 2 operands starting at 0.
5862 Ops.push_back(N->getOperand(0));
5863 if (!IsUnary || IsFakeUnary)
5864 Ops.push_back(N->getOperand(1));
5870 /// Check a target shuffle mask's inputs to see if we can set any values to
5871 /// SM_SentinelZero - this is for elements that are known to be zero
5872 /// (not just zeroable) from their inputs.
5873 /// Returns true if the target shuffle mask was decoded.
5874 static bool setTargetShuffleZeroElements(SDValue N,
5875 SmallVectorImpl<int> &Mask,
5876 SmallVectorImpl<SDValue> &Ops) {
// Only target-specific shuffle nodes are handled.
5878 if (!isTargetShuffle(N.getOpcode()))
5881 MVT VT = N.getSimpleValueType();
// Decode with AllowSentinelZero = true; failure means no mask available.
5882 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5885 SDValue V1 = Ops[0];
5886 SDValue V2 = IsUnary ? V1 : Ops[1];
5888 V1 = peekThroughBitcasts(V1);
5889 V2 = peekThroughBitcasts(V2);
5891 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5892 "Illegal split of shuffle value type");
5893 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5895 // Extract known constant input data.
5896 APInt UndefSrcElts[2];
5897 SmallVector<APInt, 32> SrcEltBits[2];
5898 bool IsSrcConstant[2] = {
5899 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5900 SrcEltBits[0], true, false),
5901 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5902 SrcEltBits[1], true, false)};
5904 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5907 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5911 // Determine shuffle input and normalize the mask.
5912 unsigned SrcIdx = M / Size;
5913 SDValue V = M < Size ? V1 : V2;
5916 // We are referencing an UNDEF input.
5918 Mask[i] = SM_SentinelUndef;
5922 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5923 // TODO: We currently only set UNDEF for integer types - floats use the same
5924 // registers as vectors and many of the scalar folded loads rely on the
5925 // SCALAR_TO_VECTOR pattern.
5926 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5927 (Size % V.getValueType().getVectorNumElements()) == 0) {
5928 int Scale = Size / V.getValueType().getVectorNumElements();
5929 int Idx = M / Scale;
5930 if (Idx != 0 && !VT.isFloatingPoint())
5931 Mask[i] = SM_SentinelUndef;
5932 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5933 Mask[i] = SM_SentinelZero;
5937 // Attempt to extract from the source's constant bits.
5938 if (IsSrcConstant[SrcIdx]) {
5939 if (UndefSrcElts[SrcIdx][M])
5940 Mask[i] = SM_SentinelUndef;
5941 else if (SrcEltBits[SrcIdx][M] == 0)
5942 Mask[i] = SM_SentinelZero;
5946 assert(VT.getVectorNumElements() == Mask.size() &&
5947 "Different mask size from vector size!");
5951 // Attempt to decode ops that could be represented as a shuffle mask.
5952 // The decoded shuffle mask may contain a different number of elements to the
5953 // destination value type.
5954 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5955 SmallVectorImpl<SDValue> &Ops,
5956 SelectionDAG &DAG) {
5960 MVT VT = N.getSimpleValueType();
5961 unsigned NumElts = VT.getVectorNumElements();
5962 unsigned NumSizeInBits = VT.getSizeInBits();
5963 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
// Byte alignment lets the cases below reason in whole bytes (shifts, masks).
5964 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5965 "Expected byte aligned value types");
5967 unsigned Opcode = N.getOpcode();
// AND/ANDNP with a constant 0x00/0xFF byte mask acts as a per-byte select
// between the other operand and zero, i.e. a shuffle with zero sentinels.
5970 case X86ISD::ANDNP: {
5971 // Attempt to decode as a per-byte mask.
5973 SmallVector<APInt, 32> EltBits;
5974 SDValue N0 = N.getOperand(0);
5975 SDValue N1 = N.getOperand(1);
5976 bool IsAndN = (X86ISD::ANDNP == Opcode);
// ANDNP inverts its first operand, so an all-ones byte there selects zero.
5977 uint64_t ZeroMask = IsAndN ? 255 : 0;
5978 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5980 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5982 Mask.push_back(SM_SentinelUndef);
5985 uint64_t ByteBits = EltBits[i].getZExtValue();
// Partial byte masks (neither 0x00 nor 0xFF) can't be modelled as a shuffle.
5986 if (ByteBits != 0 && ByteBits != 255)
5988 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5990 Ops.push_back(IsAndN ? N1 : N0);
5993 case ISD::SCALAR_TO_VECTOR: {
5994 // Match against a scalar_to_vector of an extract from a vector,
5995 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
5996 SDValue N0 = N.getOperand(0);
5999 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6000 N0.getOperand(0).getValueType() == VT) ||
6001 (N0.getOpcode() == X86ISD::PEXTRW &&
6002 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6003 (N0.getOpcode() == X86ISD::PEXTRB &&
6004 N0.getOperand(0).getValueType() == MVT::v16i8)) {
// The extraction index must be a known constant to build the mask.
6008 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6011 SDValue SrcVec = SrcExtract.getOperand(0);
6012 EVT SrcVT = SrcVec.getValueType();
6013 unsigned NumSrcElts = SrcVT.getVectorNumElements();
// PEXTRW/PEXTRB zero-extend into the wider element, so pad with zeros.
6014 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6016 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6017 if (NumSrcElts <= SrcIdx)
6020 Ops.push_back(SrcVec);
6021 Mask.push_back(SrcIdx);
6022 Mask.append(NumZeros, SM_SentinelZero);
6023 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6026 case X86ISD::PINSRB:
6027 case X86ISD::PINSRW: {
6028 SDValue InVec = N.getOperand(0);
6029 SDValue InScl = N.getOperand(1);
6030 uint64_t InIdx = N.getConstantOperandVal(2);
6031 assert(InIdx < NumElts && "Illegal insertion index");
6033 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6034 if (X86::isZeroNode(InScl)) {
6035 Ops.push_back(InVec);
6036 for (unsigned i = 0; i != NumElts; ++i)
6037 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6041 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6042 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6044 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6045 if (InScl.getOpcode() != ExOp)
6048 SDValue ExVec = InScl.getOperand(0);
6049 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6050 assert(ExIdx < NumElts && "Illegal extraction index");
// Two-input shuffle: identity from InVec except the inserted lane, which
// comes from ExVec (indices >= NumElts reference the second input).
6051 Ops.push_back(InVec);
6052 Ops.push_back(ExVec);
6053 for (unsigned i = 0; i != NumElts; ++i)
6054 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6057 case X86ISD::PACKSS:
6058 case X86ISD::PACKUS: {
6059 SDValue N0 = N.getOperand(0);
6060 SDValue N1 = N.getOperand(1);
6061 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6062 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6063 "Unexpected input value type");
6065 // If we know input saturation won't happen we can treat this
6066 // as a truncation shuffle.
6067 if (Opcode == X86ISD::PACKSS) {
// PACKSS: enough sign bits means the value survives the narrowing intact.
6068 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6069 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
// PACKUS: the discarded high half of each source element must be zero.
6072 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6073 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6074 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6078 bool IsUnary = (N0 == N1);
6084 createPackShuffleMask(VT, Mask, IsUnary);
6088 case X86ISD::VSRLI: {
6089 uint64_t ShiftVal = N.getConstantOperandVal(1);
6090 // Out of range bit shifts are guaranteed to be zero.
6091 if (NumBitsPerElt <= ShiftVal) {
6092 Mask.append(NumElts, SM_SentinelZero);
6096 // We can only decode 'whole byte' bit shifts as shuffles.
6097 if ((ShiftVal % 8) != 0)
6100 uint64_t ByteShift = ShiftVal / 8;
6101 unsigned NumBytes = NumSizeInBits / 8;
6102 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6103 Ops.push_back(N.getOperand(0));
6105 // Clear mask to all zeros and insert the shifted byte indices.
6106 Mask.append(NumBytes, SM_SentinelZero);
// Per element, shift left moves low bytes up; shift right moves high bytes
// down. Bytes shifted out of the element stay SM_SentinelZero.
6108 if (X86ISD::VSHLI == Opcode) {
6109 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6110 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6111 Mask[i + j] = i + j - ByteShift;
6113 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6114 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6115 Mask[i + j - ByteShift] = i + j;
6119 case ISD::ZERO_EXTEND_VECTOR_INREG:
6120 case X86ISD::VZEXT: {
6121 // TODO - add support for VPMOVZX with smaller input vector types.
6122 SDValue Src = N.getOperand(0);
6123 MVT SrcVT = Src.getSimpleValueType();
6124 if (NumSizeInBits != SrcVT.getSizeInBits())
6126 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6135 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6136 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6137 SmallVectorImpl<int> &Mask) {
6138 int MaskWidth = Mask.size();
6139 SmallVector<SDValue, 16> UsedInputs;
6140 for (int i = 0, e = Inputs.size(); i < e; ++i) {
// Mask indices in [lo, hi) would reference this input once it is kept.
6141 int lo = UsedInputs.size() * MaskWidth;
6142 int hi = lo + MaskWidth;
6144 // Strip UNDEF input usage.
6145 if (Inputs[i].isUndef())
6147 if ((lo <= M) && (M < hi))
6148 M = SM_SentinelUndef;
6150 // Check for unused inputs.
// Keep the input only if at least one mask element still references it;
// dropped inputs implicitly renumber all later mask indices.
6151 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6152 UsedInputs.push_back(Inputs[i]);
// Replace the input list with only the inputs that are actually used.
6159 Inputs = UsedInputs;
6162 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6163 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6164 /// remaining input indices in case we now have a unary shuffle and adjust the
6165 /// inputs accordingly.
6166 /// Returns true if the target shuffle mask was decoded.
6167 static bool resolveTargetShuffleInputs(SDValue Op,
6168 SmallVectorImpl<SDValue> &Inputs,
6169 SmallVectorImpl<int> &Mask,
6170 SelectionDAG &DAG) {
// Try a real target shuffle decode first, then fall back to "faux" shuffles
// (ops like AND/PACK/shifts that merely behave as shuffles).
6171 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6172 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
// Canonicalize: drop undef/unused inputs and renumber the mask.
6175 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6179 /// Returns the scalar element that will make up the ith
6180 /// element of the result of the vector shuffle.
6181 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6184 return SDValue(); // Limit search depth.
6186 SDValue V = SDValue(N, 0);
6187 EVT VT = V.getValueType();
6188 unsigned Opcode = V.getOpcode();
6190 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6191 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6192 int Elt = SV->getMaskElt(Index);
6195 return DAG.getUNDEF(VT.getVectorElementType());
6197 unsigned NumElems = VT.getVectorNumElements();
// Mask indices >= NumElems select from the second shuffle operand.
6198 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6199 : SV->getOperand(1);
6200 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6203 // Recurse into target specific vector shuffles to find scalars.
6204 if (isTargetShuffle(Opcode)) {
6205 MVT ShufVT = V.getSimpleValueType();
6206 MVT ShufSVT = ShufVT.getVectorElementType();
6207 int NumElems = (int)ShufVT.getVectorNumElements();
6208 SmallVector<int, 16> ShuffleMask;
6209 SmallVector<SDValue, 16> ShuffleOps;
6212 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6215 int Elt = ShuffleMask[Index];
// Known-zero lanes materialize as a scalar 0 of the element type.
6216 if (Elt == SM_SentinelZero)
6217 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6218 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT)
6219 if (Elt == SM_SentinelUndef)
6220 return DAG.getUNDEF(ShufSVT);
6222 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6223 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6224 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6228 // Actual nodes that may contain scalar elements
6229 if (Opcode == ISD::BITCAST) {
6230 V = V.getOperand(0);
6231 EVT SrcVT = V.getValueType();
6232 unsigned NumElems = VT.getVectorNumElements();
// Only look through bitcasts that preserve the element count.
6234 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6238 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6239 return (Index == 0) ? V.getOperand(0)
6240 : DAG.getUNDEF(VT.getVectorElementType());
6242 if (V.getOpcode() == ISD::BUILD_VECTOR)
6243 return V.getOperand(Index);
6248 // Use PINSRB/PINSRW/PINSRD to create a build vector.
// NonZeros is a bitmask of which build-vector operands are non-zero;
// NumNonZero/NumZero are their population counts.
6249 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6250 unsigned NumNonZero, unsigned NumZero,
6252 const X86Subtarget &Subtarget) {
6253 MVT VT = Op.getSimpleValueType();
6254 unsigned NumElts = VT.getVectorNumElements();
// PINSRW needs SSE2; PINSRB/PINSRD need SSE4.1.
6255 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6256 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6257 "Illegal vector insertion");
6263 for (unsigned i = 0; i < NumElts; ++i) {
6264 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6268 // If the build vector contains zeros or our first insertion is not the
6269 // first index then insert into zero vector to break any register
6270 // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6273 if (NumZero || 0 != i)
6274 V = getZeroVector(VT, Subtarget, DAG, dl);
6276 assert(0 == i && "Expected insertion into zero-index");
// VZEXT_MOVL moves the scalar into lane 0 and zeros the upper lanes.
6277 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6278 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6279 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6280 V = DAG.getBitcast(VT, V);
6284 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6285 DAG.getIntPtrConstant(i, dl));
6291 /// Custom lower build_vector of v16i8.
6292 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6293 unsigned NumNonZero, unsigned NumZero,
6295 const X86Subtarget &Subtarget) {
// Too many inserts without PINSRB isn't profitable; give up.
6296 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6299 // SSE4.1 - use PINSRB to insert each byte directly.
6300 if (Subtarget.hasSSE41())
6301 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6308 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6309 for (unsigned i = 0; i < 16; ++i) {
6310 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
// Lazily materialize the accumulator on the first non-zero byte:
// zeroed if any lane must be zero, otherwise undef.
6311 if (ThisIsNonZero && First) {
6313 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6315 V = DAG.getUNDEF(MVT::v8i16);
6320 // FIXME: Investigate extending to i32 instead of just i16.
6321 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
// Combine byte pair (i-1, i) into one i16: low byte OR (high byte << 8).
6322 SDValue ThisElt, LastElt;
6323 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6324 if (LastIsNonZero) {
6326 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6328 if (ThisIsNonZero) {
6329 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6330 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6331 DAG.getConstant(8, dl, MVT::i8));
6333 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6339 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6340 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6341 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6342 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6343 V = DAG.getBitcast(MVT::v8i16, V);
6345 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6346 DAG.getIntPtrConstant(i / 2, dl));
6352 return DAG.getBitcast(MVT::v16i8, V);
6355 /// Custom lower build_vector of v8i16.
6356 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6357 unsigned NumNonZero, unsigned NumZero,
6359 const X86Subtarget &Subtarget) {
// More than 4 inserts without SSE4.1 isn't profitable; bail out.
6360 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6363 // Use PINSRW to insert each byte directly.
6364 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6368 /// Custom lower build_vector of v4i32 or v4f32.
6369 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6370 const X86Subtarget &Subtarget) {
6371 // Find all zeroable elements.
6372 std::bitset<4> Zeroable;
6373 for (int i=0; i < 4; ++i) {
6374 SDValue Elt = Op->getOperand(i);
6375 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6377 assert(Zeroable.size() - Zeroable.count() > 1 &&
6378 "We expect at least two non-zero elements!");
6380 // We only know how to deal with build_vector nodes where elements are either
6381 // zeroable or extract_vector_elt with constant index.
6382 SDValue FirstNonZero;
6383 unsigned FirstNonZeroIdx;
6384 for (unsigned i=0; i < 4; ++i) {
6387 SDValue Elt = Op->getOperand(i);
6388 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6389 !isa<ConstantSDNode>(Elt.getOperand(1)))
6391 // Make sure that this node is extracting from a 128-bit vector.
6392 MVT VT = Elt.getOperand(0).getSimpleValueType();
6393 if (!VT.is128BitVector())
6395 if (!FirstNonZero.getNode()) {
6397 FirstNonZeroIdx = i;
6401 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6402 SDValue V1 = FirstNonZero.getOperand(0);
6403 MVT VT = V1.getSimpleValueType();
6405 // See if this build_vector can be lowered as a blend with zero.
// This works when every element is either zeroable or the identity
// extract (element i of V1 into lane i).
6407 unsigned EltMaskIdx, EltIdx;
6409 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6410 if (Zeroable[EltIdx]) {
6411 // The zero vector will be on the right hand side.
6412 Mask[EltIdx] = EltIdx+4;
6416 Elt = Op->getOperand(EltIdx);
6417 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6418 EltMaskIdx = Elt.getConstantOperandVal(1);
6419 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6421 Mask[EltIdx] = EltIdx;
6425 // Let the shuffle legalizer deal with blend operations.
6426 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6427 if (V1.getSimpleValueType() != VT)
6428 V1 = DAG.getBitcast(VT, V1);
6429 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6432 // See if we can lower this build_vector to a INSERTPS.
6433 if (!Subtarget.hasSSE41())
6436 SDValue V2 = Elt.getOperand(0);
6437 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
// Everything after the mismatching element must still be the identity
// extract from V1 (or zeroable) for INSERTPS to be valid.
6440 bool CanFold = true;
6441 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6445 SDValue Current = Op->getOperand(i);
6446 SDValue SrcVector = Current->getOperand(0);
6449 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6455 assert(V1.getNode() && "Expected at least two non-zero elements!");
// INSERTPS operates on v4f32; bitcast integer vectors as needed.
6456 if (V1.getSimpleValueType() != MVT::v4f32)
6457 V1 = DAG.getBitcast(MVT::v4f32, V1);
6458 if (V2.getSimpleValueType() != MVT::v4f32)
6459 V2 = DAG.getBitcast(MVT::v4f32, V2);
6461 // Ok, we can emit an INSERTPS instruction.
// INSERTPS immediate: bits[7:6] = source lane, bits[5:4] = dest lane,
// bits[3:0] = lanes to force to zero.
6462 unsigned ZMask = Zeroable.to_ulong();
6464 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6465 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6467 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6468 DAG.getIntPtrConstant(InsertPSMask, DL));
6469 return DAG.getBitcast(VT, Result);
6472 /// Return a vector logical shift node.
// Shifts the whole 128-bit value by NumBits (a byte multiple) using
// VSHLDQ/VSRLDQ on a v16i8 view, then bitcasts back to VT.
6473 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6474 SelectionDAG &DAG, const TargetLowering &TLI,
6476 assert(VT.is128BitVector() && "Unknown type for VShift");
6477 MVT ShVT = MVT::v16i8;
6478 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6479 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6480 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6481 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
// The byte-shift instructions take the shift amount in bytes, not bits.
6482 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6483 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
// Try to widen a scalar stack load into an aligned vector load and splat
// the addressed element via a shuffle.
6486 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6487 SelectionDAG &DAG) {
6489 // Check if the scalar load can be widened into a vector load. And if
6490 // the address is "base + cst" see if the cst can be "absorbed" into
6491 // the shuffle mask.
6492 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6493 SDValue Ptr = LD->getBasePtr();
// Extending or volatile loads can't be safely widened.
6494 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6496 EVT PVT = LD->getValueType(0);
6497 if (PVT != MVT::i32 && PVT != MVT::f32)
// Only stack slots are handled: either a direct frame index or
// "frameindex + constant offset".
6502 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6503 FI = FINode->getIndex();
6505 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6506 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6507 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6508 Offset = Ptr.getConstantOperandVal(1);
6509 Ptr = Ptr.getOperand(0);
6514 // FIXME: 256-bit vector instructions don't require a strict alignment,
6515 // improve this code to support it better.
6516 unsigned RequiredAlign = VT.getSizeInBits()/8;
6517 SDValue Chain = LD->getChain();
6518 // Make sure the stack object alignment is at least 16 or 32.
6519 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6520 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6521 if (MFI.isFixedObjectIndex(FI)) {
6522 // Can't change the alignment. FIXME: It's possible to compute
6523 // the exact stack offset and reference FI + adjust offset instead.
6524 // If someone *really* cares about this. That's the way to implement it.
6527 MFI.setObjectAlignment(FI, RequiredAlign);
6531 // (Offset % 16 or 32) must be multiple of 4. Then address is then
6532 // Ptr + (Offset & ~15).
6535 if ((Offset % RequiredAlign) & 3)
// Round the offset down to the vector alignment boundary.
6537 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6540 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6541 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
// The element to splat is the residual offset in 4-byte units.
6544 int EltNo = (Offset - StartOffset) >> 2;
6545 unsigned NumElems = VT.getVectorNumElements();
6547 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6548 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6549 LD->getPointerInfo().getWithOffset(StartOffset));
6551 SmallVector<int, 8> Mask(NumElems, EltNo);
6553 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6559 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6560 /// elements can be replaced by a single large load which has the same value as
6561 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6563 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6564 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6565 const SDLoc &DL, SelectionDAG &DAG,
6566 const X86Subtarget &Subtarget,
6567 bool isAfterLegalize) {
6568 unsigned NumElems = Elts.size();
6570 int LastLoadedElt = -1;
// Classify every element as exactly one of: load / zero / undef.
6571 SmallBitVector LoadMask(NumElems, false);
6572 SmallBitVector ZeroMask(NumElems, false);
6573 SmallBitVector UndefMask(NumElems, false);
6575 // For each element in the initializer, see if we've found a load, zero or an
6577 for (unsigned i = 0; i < NumElems; ++i) {
6578 SDValue Elt = peekThroughBitcasts(Elts[i]);
6583 UndefMask[i] = true;
6584 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6586 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6589 // Each loaded element must be the correct fractional portion of the
6590 // requested vector load.
6591 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6596 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6597 "Incomplete element masks");
6599 // Handle Special Cases - all undef or undef/zero.
6600 if (UndefMask.count() == NumElems)
6601 return DAG.getUNDEF(VT);
6603 // FIXME: Should we return this as a BUILD_VECTOR instead?
6604 if ((ZeroMask | UndefMask).count() == NumElems)
6605 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6606 : DAG.getConstantFP(0.0, DL, VT);
6608 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6609 int FirstLoadedElt = LoadMask.find_first();
6610 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6611 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6612 EVT LDBaseVT = EltBase.getValueType();
6614 // Consecutive loads can contain UNDEFS but not ZERO elements.
6615 // Consecutive loads with UNDEFs and ZEROs elements require a
6616 // an additional shuffle stage to clear the ZERO elements.
6617 bool IsConsecutiveLoad = true;
6618 bool IsConsecutiveLoadWithZeros = true;
6619 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6621 SDValue Elt = peekThroughBitcasts(Elts[i]);
6622 LoadSDNode *LD = cast<LoadSDNode>(Elt);
// Each load must sit exactly (i - FirstLoadedElt) elements past the base.
6623 if (!DAG.areNonVolatileConsecutiveLoads(
6624 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6625 i - FirstLoadedElt)) {
6626 IsConsecutiveLoad = false;
6627 IsConsecutiveLoadWithZeros = false;
6630 } else if (ZeroMask[i]) {
6631 IsConsecutiveLoad = false;
// Collect the individual loads so their chains can be re-linked to the
// merged load below.
6635 SmallVector<LoadSDNode *, 8> Loads;
6636 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6638 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6640 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6641 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6642 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6643 "Cannot merge volatile loads.");
6645 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6646 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
// Keep memory ordering: make every original load depend on the new one.
6647 for (auto *LD : Loads)
6648 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6652 // LOAD - all consecutive load/undefs (must start/end with a load).
6653 // If we have found an entire vector of loads and undefs, then return a large
6654 // load of the entire vector width starting at the base pointer.
6655 // If the vector contains zeros, then attempt to shuffle those elements.
6656 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6657 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6658 assert(LDBase && "Did not find base load for merging consecutive loads");
6659 EVT EltVT = LDBase->getValueType(0);
6660 // Ensure that the input vector size for the merged loads matches the
6661 // cumulative size of the input elements.
6662 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6665 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6668 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6669 // will lower to regular temporal loads and use the cache.
6670 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6671 VT.is256BitVector() && !Subtarget.hasInt256())
6674 if (IsConsecutiveLoad)
6675 return CreateLoad(VT, LDBase);
6677 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6678 // vector and a zero vector to clear out the zero elements.
6679 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6680 SmallVector<int, 4> ClearMask(NumElems, -1);
6681 for (unsigned i = 0; i < NumElems; ++i) {
6683 ClearMask[i] = i + NumElems;
6684 else if (LoadMask[i])
6687 SDValue V = CreateLoad(VT, LDBase);
6688 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6689 : DAG.getConstantFP(0.0, DL, VT);
6690 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
// Total bits covered by the contiguous run of loaded elements.
6695 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6697 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6698 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6699 (LoadSize == 32 || LoadSize == 64) &&
6700 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6701 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6702 : MVT::getIntegerVT(LoadSize);
6703 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6704 if (TLI.isTypeLegal(VecVT)) {
6705 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6706 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6708 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6709 LDBase->getPointerInfo(),
6710 LDBase->getAlignment(),
6711 MachineMemOperand::MOLoad);
6712 for (auto *LD : Loads)
6713 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6714 return DAG.getBitcast(VT, ResNode);
// Build an IR ConstantVector by slicing SplatBitSize bits of SplatValue
// into NumElm scalars of VT's element type (int or 32/64-bit float).
6721 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6722 unsigned SplatBitSize, LLVMContext &C) {
6723 unsigned ScalarSize = VT.getScalarSizeInBits();
6724 unsigned NumElm = SplatBitSize / ScalarSize;
6726 SmallVector<Constant *, 32> ConstantVec;
6727 for (unsigned i = 0; i < NumElm; i++) {
// Element i occupies bits [i*ScalarSize, (i+1)*ScalarSize) of the splat.
6728 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6730 if (VT.isFloatingPoint()) {
6731 if (ScalarSize == 32) {
6732 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6734 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6735 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6738 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6739 ConstantVec.push_back(Const);
6741 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
// Returns true if any user of N is a target shuffle (looking through
// bitcast users recursively).
6744 static bool isUseOfShuffle(SDNode *N) {
6745 for (auto *U : N->uses()) {
6746 if (isTargetShuffle(U->getOpcode()))
6748 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6749 return isUseOfShuffle(U);
6754 // Check if the current node of build vector is a zero extended vector.
6755 // // If so, return the value extended.
6756 // // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6757 // // NumElt - return the number of zero extended identical values.
6758 // // EltType - return the type of the value include the zero extend.
6759 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6760 unsigned &NumElt, MVT &EltType) {
6761 SDValue ExtValue = Op->getOperand(0);
6762 unsigned NumElts = Op->getNumOperands();
// Delta = distance between repeats of ExtValue (NumElts if never repeated).
6763 unsigned Delta = NumElts;
6765 for (unsigned i = 1; i < NumElts; i++) {
6766 if (Op->getOperand(i) == ExtValue) {
// Between repeats, only undef or zero operands are allowed.
6770 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
// The repeat stride must be a power of two > 1 so it maps to an
// integer zero-extended element type.
6773 if (!isPowerOf2_32(Delta) || Delta == 1)
// Verify the pattern holds for the remainder of the vector: ExtValue at
// every multiple of Delta, zero/undef everywhere else.
6776 for (unsigned i = Delta; i < NumElts; i++) {
6777 if (i % Delta == 0) {
6778 if (Op->getOperand(i) != ExtValue)
6780 } else if (!(isNullConstant(Op->getOperand(i)) ||
6781 Op->getOperand(i).isUndef()))
6784 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
6785 unsigned ExtVTSize = EltSize * Delta;
6786 EltType = MVT::getIntegerVT(ExtVTSize);
6787 NumElt = NumElts / Delta;
6791 /// Attempt to use the vbroadcast instruction to generate a splat value
6792 /// from a splat BUILD_VECTOR which uses:
6793 /// a. A single scalar load, or a constant.
6794 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6796 /// The VBROADCAST node is returned when a pattern is found,
6797 /// or SDValue() otherwise.
6798 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6799 const X86Subtarget &Subtarget,
6800 SelectionDAG &DAG) {
6801 // VBROADCAST requires AVX.
6802 // TODO: Splats could be generated for non-AVX CPUs using SSE
6803 // instructions, but there's less potential gain for only 128-bit vectors.
6804 if (!Subtarget.hasAVX())
6807 MVT VT = BVOp->getSimpleValueType(0);
6810 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6811 "Unsupported vector type for broadcast.");
6813 BitVector UndefElements;
6814 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6816 // Attempt to use VBROADCASTM
6817 // From this paterrn:
6818 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
6819 // b. t1 = (build_vector t0 t0)
6821 // Create (VBROADCASTM v2i1 X)
6822 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
6823 MVT EltType = VT.getScalarType();
6824 unsigned NumElts = VT.getVectorNumElements();
6826 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
6827 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
6828 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
6829 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
6831 BOperand = ZeroExtended.getOperand(0);
6833 BOperand = Ld.getOperand(0).getOperand(0);
6834 if (BOperand.getValueType().isVector() &&
6835 BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
6836 if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
6837 NumElts == 8)) || // for broadcastmb2q
6838 (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
6839 NumElts == 16))) { // for broadcastmw2d
6841 DAG.getNode(X86ISD::VBROADCASTM, dl,
6842 MVT::getVectorVT(EltType, NumElts), BOperand);
6843 return DAG.getBitcast(VT, Brdcst);
6849 // We need a splat of a single value to use broadcast, and it doesn't
6850 // make any sense if the value is only in one element of the vector.
6851 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6852 APInt SplatValue, Undef;
6853 unsigned SplatBitSize;
6855 // Check if this is a repeated constant pattern suitable for broadcasting.
6856 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6857 SplatBitSize > VT.getScalarSizeInBits() &&
6858 SplatBitSize < VT.getSizeInBits()) {
6859 // Avoid replacing with broadcast when it's a use of a shuffle
6860 // instruction to preserve the present custom lowering of shuffles.
6861 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6863 // replace BUILD_VECTOR with broadcast of the repeated constants.
6864 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6865 LLVMContext *Ctx = DAG.getContext();
6866 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6867 if (Subtarget.hasAVX()) {
6868 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6869 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6870 // Splatted value can fit in one INTEGER constant in constant pool.
6871 // Load the constant and broadcast it.
6872 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6873 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6874 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6875 SDValue CP = DAG.getConstantPool(C, PVT);
6876 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6878 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6880 CVT, dl, DAG.getEntryNode(), CP,
6881 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6883 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6884 MVT::getVectorVT(CVT, Repeat), Ld);
6885 return DAG.getBitcast(VT, Brdcst);
6886 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6887 // Splatted value can fit in one FLOAT constant in constant pool.
6888 // Load the constant and broadcast it.
6889 // AVX have support for 32 and 64 bit broadcast for floats only.
6890 // No 64bit integer in 32bit subtarget.
6891 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6892 // Lower the splat via APFloat directly, to avoid any conversion.
6895 ? ConstantFP::get(*Ctx,
6896 APFloat(APFloat::IEEEsingle(), SplatValue))
6897 : ConstantFP::get(*Ctx,
6898 APFloat(APFloat::IEEEdouble(), SplatValue));
6899 SDValue CP = DAG.getConstantPool(C, PVT);
6900 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6902 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6904 CVT, dl, DAG.getEntryNode(), CP,
6905 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6907 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6908 MVT::getVectorVT(CVT, Repeat), Ld);
6909 return DAG.getBitcast(VT, Brdcst);
6910 } else if (SplatBitSize > 64) {
6911 // Load the vector of constants and broadcast it.
6912 MVT CVT = VT.getScalarType();
6913 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6915 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6916 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6917 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6919 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6920 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6922 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6923 return DAG.getBitcast(VT, Brdcst);
6930 bool ConstSplatVal =
6931 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6933 // Make sure that all of the users of a non-constant load are from the
6934 // BUILD_VECTOR node.
6935 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6938 unsigned ScalarSize = Ld.getValueSizeInBits();
6939 bool IsGE256 = (VT.getSizeInBits() >= 256);
6941 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6942 // instruction to save 8 or more bytes of constant pool data.
6943 // TODO: If multiple splats are generated to load the same constant,
6944 // it may be detrimental to overall size. There needs to be a way to detect
6945 // that condition to know if this is truly a size win.
6946 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
6948 // Handle broadcasting a single constant scalar from the constant pool
6950 // On Sandybridge (no AVX2), it is still better to load a constant vector
6951 // from the constant pool and not to broadcast it from a scalar.
6952 // But override that restriction when optimizing for size.
6953 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6954 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6955 EVT CVT = Ld.getValueType();
6956 assert(!CVT.isVector() && "Must not broadcast a vector type");
6958 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6959 // For size optimization, also splat v2f64 and v2i64, and for size opt
6960 // with AVX2, also splat i8 and i16.
6961 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6962 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6963 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6964 const Constant *C = nullptr;
6965 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6966 C = CI->getConstantIntValue();
6967 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6968 C = CF->getConstantFPValue();
6970 assert(C && "Invalid constant type");
6972 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6974 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6975 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6977 CVT, dl, DAG.getEntryNode(), CP,
6978 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6981 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6985 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6987 // Handle AVX2 in-register broadcasts.
6988 if (!IsLoad && Subtarget.hasInt256() &&
6989 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6990 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6992 // The scalar source must be a normal load.
6996 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6997 (Subtarget.hasVLX() && ScalarSize == 64))
6998 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7000 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
7001 // double since there is no vbroadcastsd xmm
7002 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7003 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7004 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7007 // Unsupported broadcast.
7011 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
7012 /// underlying vector and index.
7014 /// Modifies \p ExtractedFromVec to the real vector and returns the real
// index, looking through a shuffle so the caller can extract directly from
// the shuffle's source operand instead.
7016 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
// Callers guarantee ExtIdx is a ConstantSDNode; the cast asserts otherwise.
7018 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
// If the source is not a shuffle there is nothing to look through.
7019 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7022 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7024 // (extract_vector_elt (v8f32 %1), Constant<6>)
7026 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7027 // (extract_subvector (v8f32 %0), Constant<4>),
7030 // In this case the vector is the extract_subvector expression and the index
7031 // is 2, as specified by the shuffle.
7032 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7033 SDValue ShuffleVec = SVOp->getOperand(0);
7034 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
// Element types must agree or the remapped index would be meaningless.
7035 assert(ShuffleVecVT.getVectorElementType() ==
7036 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7038 int ShuffleIdx = SVOp->getMaskElt(Idx);
// Only look through to operand 0 of the shuffle; mask elements referencing
// the second shuffle input are outside [0, NumElements).
7039 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7040 ExtractedFromVec = ShuffleVec;
// Lower a BUILD_VECTOR whose elements are mostly extract_vector_elt from at
// most two source vectors as a vector_shuffle of those sources, followed by
// at most one INSERT_VECTOR_ELT for the single non-extract element.
// Returns an empty SDValue when the pattern does not match.
7046 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7047 MVT VT = Op.getSimpleValueType();
7049 // Skip if insert_vec_elt is not supported.
7050 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7051 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7055 unsigned NumElems = Op.getNumOperands();
// Indices of operands that must be inserted after the shuffle (at most one),
// and the shuffle mask built from the matched extracts (-1 = undef).
7059 SmallVector<unsigned, 4> InsertIndices;
7060 SmallVector<int, 8> Mask(NumElems, -1);
7062 for (unsigned i = 0; i != NumElems; ++i) {
7063 unsigned Opc = Op.getOperand(i).getOpcode();
7065 if (Opc == ISD::UNDEF)
7068 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7069 // Quit if more than 1 elements need inserting.
7070 if (InsertIndices.size() > 1)
7073 InsertIndices.push_back(i);
7077 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7078 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7080 // Quit if non-constant index.
7081 if (!isa<ConstantSDNode>(ExtIdx))
// Look through shuffles to the real source vector and remapped index.
7083 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7085 // Quit if extracted from vector of different type.
7086 if (ExtractedFromVec.getValueType() != VT)
7089 if (!VecIn1.getNode())
7090 VecIn1 = ExtractedFromVec;
7091 else if (VecIn1 != ExtractedFromVec) {
7092 if (!VecIn2.getNode())
7093 VecIn2 = ExtractedFromVec;
7094 else if (VecIn2 != ExtractedFromVec)
7095 // Quit if more than 2 vectors to shuffle
// Mask element: [0, NumElems) selects from VecIn1, [NumElems, 2*NumElems)
// selects from VecIn2, matching vector_shuffle mask conventions.
7099 if (ExtractedFromVec == VecIn1)
7101 else if (ExtractedFromVec == VecIn2)
7102 Mask[i] = Idx + NumElems;
7105 if (!VecIn1.getNode())
7108 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7109 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
// Patch in the (at most one) element that was not an extract.
7111 for (unsigned Idx : InsertIndices)
7112 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7113 DAG.getIntPtrConstant(Idx, DL));
// Pack a constant build_vector of i1 elements into a single integer constant
// whose bit idx holds element idx. The result is at least 8 bits wide.
7118 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7119 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7120 Op.getScalarValueSizeInBits() == 1 &&
7121 "Can not convert non-constant vector");
7122 uint64_t Immediate = 0;
7123 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7124 SDValue In = Op.getOperand(idx);
// Only the low bit of each constant element contributes.
7126 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
// Round the result type up to at least i8 (there is no legal i1..i7 scalar).
7129 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7130 return DAG.getConstant(Immediate, dl, VT);
7132 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
// All-zero / all-ones vectors and fully-constant vectors are materialized
// directly; otherwise constants are packed into an immediate and the
// remaining non-constant elements are inserted one by one.
7133 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7134 const X86Subtarget &Subtarget) {
7136 MVT VT = Op.getSimpleValueType();
7137 assert((VT.getVectorElementType() == MVT::i1) &&
7138 "Unexpected type in LowerBUILD_VECTORvXi1!");
7141 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7144 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7147 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
// A 64-bit immediate cannot be formed on a 32-bit target, so split v64i1
// into two v32i1 halves and lower each recursively.
7148 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7149 // Split the pieces.
7151 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7153 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7154 // We have to manually lower both halves so getNode doesn't try to
7155 // reassemble the build_vector.
7156 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7157 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7158 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
// Pack the constants into an integer immediate and bitcast it back.
7160 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7161 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7162 return DAG.getBitcast(VT, Imm);
// Immediate was widened to i8: bitcast to v8i1 and take the low subvector.
7163 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7164 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7165 DAG.getIntPtrConstant(0, dl));
7168 // Vector has one or more non-const elements
7169 uint64_t Immediate = 0;
7170 SmallVector<unsigned, 16> NonConstIdx;
7171 bool IsSplat = true;
7172 bool HasConstElts = false;
// Classify each element: record non-constant positions, accumulate constant
// bits into Immediate, and track whether all elements are one splat value.
7174 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7175 SDValue In = Op.getOperand(idx);
7178 if (!isa<ConstantSDNode>(In))
7179 NonConstIdx.push_back(idx);
7181 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7182 HasConstElts = true;
7186 else if (In != Op.getOperand(SplatIdx))
7190 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7192 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7193 DAG.getConstant(1, dl, VT),
7194 DAG.getConstant(0, dl, VT));
7196 // insert elements one by one
// Seed the destination with the constant part (or zero/undef), then insert
// the non-constant elements.
7200 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7201 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7203 else if (HasConstElts)
7204 Imm = DAG.getConstant(0, dl, VT);
7206 Imm = DAG.getUNDEF(VT);
7207 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7208 DstVec = DAG.getBitcast(VT, Imm);
7210 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7211 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7212 DAG.getIntPtrConstant(0, dl));
7215 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7216 unsigned InsertIdx = NonConstIdx[i];
7217 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7218 Op.getOperand(InsertIdx),
7219 DAG.getIntPtrConstant(InsertIdx, dl));
7224 /// \brief Return true if \p N implements a horizontal binop and return the
7225 /// operands for the horizontal binop into V0 and V1.
7227 /// This is a helper function of LowerToHorizontalOp().
7228 /// This function checks that the build_vector \p N in input implements a
7229 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7230 /// operation to match.
7231 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7232 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7233 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7236 /// This function only analyzes elements of \p N whose indices are
7237 /// in range [BaseIdx, LastIdx).
7238 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7240 unsigned BaseIdx, unsigned LastIdx,
7241 SDValue &V0, SDValue &V1) {
7242 EVT VT = N->getValueType(0);
7244 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7245 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7246 "Invalid Vector in input!");
// ADD/FADD may have their extract operands swapped; SUB/FSUB may not.
7248 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7249 bool CanFold = true;
7250 unsigned ExpectedVExtractIdx = BaseIdx;
7251 unsigned NumElts = LastIdx - BaseIdx;
7252 V0 = DAG.getUNDEF(VT);
7253 V1 = DAG.getUNDEF(VT);
7255 // Check if N implements a horizontal binop.
7256 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7257 SDValue Op = N->getOperand(i + BaseIdx);
7260 if (Op->isUndef()) {
7261 // Update the expected vector extract index.
// First element of the second half restarts the expected extract index,
// because the second half reads from V1 starting at BaseIdx.
7262 if (i * 2 == NumElts)
7263 ExpectedVExtractIdx = BaseIdx;
7264 ExpectedVExtractIdx += 2;
// Each lane must be the requested binop with a single use.
7268 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7273 SDValue Op0 = Op.getOperand(0);
7274 SDValue Op1 = Op.getOperand(1);
7276 // Try to match the following pattern:
7277 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7278 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7279 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7280 Op0.getOperand(0) == Op1.getOperand(0) &&
7281 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7282 isa<ConstantSDNode>(Op1.getOperand(1)));
7286 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7287 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
// First half of the lanes defines V0, second half defines V1.
7289 if (i * 2 < NumElts) {
7291 V0 = Op0.getOperand(0);
7292 if (V0.getValueType() != VT)
7297 V1 = Op0.getOperand(0);
7298 if (V1.getValueType() != VT)
7301 if (i * 2 == NumElts)
7302 ExpectedVExtractIdx = BaseIdx;
7305 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7306 if (I0 == ExpectedVExtractIdx)
7307 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7308 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7309 // Try to match the following dag sequence:
7310 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7311 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
// Each lane consumes a pair of adjacent source elements.
7315 ExpectedVExtractIdx += 2;
7321 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7322 /// a concat_vector.
7324 /// This is a helper function of LowerToHorizontalOp().
7325 /// This function expects two 256-bit vectors called V0 and V1.
7326 /// At first, each vector is split into two separate 128-bit vectors.
7327 /// Then, the resulting 128-bit vectors are used to implement two
7328 /// horizontal binary operations.
7330 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7332 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7333 /// the two new horizontal binop.
7334 /// When Mode is set, the first horizontal binop dag node would take as input
7335 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7336 /// horizontal binop dag node would take as input the lower 128-bit of V1
7337 /// and the upper 128-bit of V1.
7339 /// HADD V0_LO, V0_HI
7340 /// HADD V1_LO, V1_HI
7342 /// Otherwise, the first horizontal binop dag node takes as input the lower
7343 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7344 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7346 /// HADD V0_LO, V1_LO
7347 /// HADD V0_HI, V1_HI
7349 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7350 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7351 /// the upper 128-bits of the result.
7352 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7353 const SDLoc &DL, SelectionDAG &DAG,
7354 unsigned X86Opcode, bool Mode,
7355 bool isUndefLO, bool isUndefHI) {
7356 MVT VT = V0.getSimpleValueType();
7357 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7358 "Invalid nodes in input!");
// Split both 256-bit inputs into their 128-bit halves.
7360 unsigned NumElts = VT.getVectorNumElements();
7361 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7362 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7363 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7364 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7365 MVT NewVT = V0_LO.getSimpleValueType();
// Halves default to UNDEF unless a binop below overwrites them.
7367 SDValue LO = DAG.getUNDEF(NewVT);
7368 SDValue HI = DAG.getUNDEF(NewVT);
// Mode set: each result half is a horizontal op within one input vector.
7371 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7372 if (!isUndefLO && !V0->isUndef())
7373 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI)
7374 if (!isUndefHI && !V1->isUndef())
7375 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
// Mode clear: each result half combines the matching halves of V0 and V1.
7377 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7378 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7379 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7381 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7382 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7385 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7388 /// Returns true iff \p BV builds a vector with the result equivalent to
7389 /// the result of ADDSUB operation.
7390 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7391 /// are written to the parameters \p Opnd0 and \p Opnd1.
// \p NumExtracts receives the number of extract_vector_elt pairs that were
// matched, which the FMADDSUB combine uses as the expected use count.
7392 static bool isAddSub(const BuildVectorSDNode *BV,
7393 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7394 SDValue &Opnd0, SDValue &Opnd1,
7395 unsigned &NumExtracts) {
// ADDSUB is only available for these FP vector types on each feature level.
7397 MVT VT = BV->getSimpleValueType(0);
7398 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7399 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7400 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7403 unsigned NumElts = VT.getVectorNumElements();
7404 SDValue InVec0 = DAG.getUNDEF(VT);
7405 SDValue InVec1 = DAG.getUNDEF(VT);
7409 // Odd-numbered elements in the input build vector are obtained from
7410 // adding two integer/float elements.
7411 // Even-numbered elements in the input build vector are obtained from
7412 // subtracting two integer/float elements.
// Walk the lanes, alternating the opcode we expect (FSUB, FADD, FSUB, ...).
7413 unsigned ExpectedOpcode = ISD::FSUB;
7414 unsigned NextExpectedOpcode = ISD::FADD;
7415 bool AddFound = false;
7416 bool SubFound = false;
7418 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7419 SDValue Op = BV->getOperand(i);
7421 // Skip 'undef' values.
7422 unsigned Opcode = Op.getOpcode();
7423 if (Opcode == ISD::UNDEF) {
// Undef lanes still advance the add/sub alternation.
7424 std::swap(ExpectedOpcode, NextExpectedOpcode);
7428 // Early exit if we found an unexpected opcode.
7429 if (Opcode != ExpectedOpcode)
7432 SDValue Op0 = Op.getOperand(0);
7433 SDValue Op1 = Op.getOperand(1);
7435 // Try to match the following pattern:
7436 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7437 // Early exit if we cannot match that sequence.
7438 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7439 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7440 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7441 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7442 Op0.getOperand(1) != Op1.getOperand(1))
7445 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7449 // We found a valid add/sub node. Update the information accordingly.
7455 // Update InVec0 and InVec1.
7456 if (InVec0.isUndef()) {
7457 InVec0 = Op0.getOperand(0);
7458 if (InVec0.getSimpleValueType() != VT)
7461 if (InVec1.isUndef()) {
7462 InVec1 = Op1.getOperand(0);
7463 if (InVec1.getSimpleValueType() != VT)
7467 // Make sure that operands in input to each add/sub node always
7468 // come from a same pair of vectors.
7469 if (InVec0 != Op0.getOperand(0)) {
// FSUB is not commutable: mismatched operand order cannot be repaired.
7470 if (ExpectedOpcode == ISD::FSUB)
7473 // FADD is commutable. Try to commute the operands
7474 // and then test again.
7475 std::swap(Op0, Op1);
7476 if (InVec0 != Op0.getOperand(0))
7480 if (InVec1 != Op1.getOperand(0))
7483 // Update the pair of expected opcodes.
7484 std::swap(ExpectedOpcode, NextExpectedOpcode);
7486 // Increment the number of extractions done.
7490 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7491 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7499 /// Returns true if is possible to fold MUL and an idiom that has already been
7500 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7501 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7502 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7504 /// Prior to calling this function it should be known that there is some
7505 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7506 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7507 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7508 /// of \p Opnd0 uses is expected to be equal to 2.
7509 /// For example, this function may be called for the following IR:
7510 /// %AB = fmul fast <2 x double> %A, %B
7511 /// %Sub = fsub fast <2 x double> %AB, %C
7512 /// %Add = fadd fast <2 x double> %AB, %C
7513 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7514 /// <2 x i32> <i32 0, i32 3>
7515 /// There is a def for %Addsub here, which potentially can be replaced by
7516 /// X86ISD::ADDSUB operation:
7517 /// %Addsub = X86ISD::ADDSUB %AB, %C
7518 /// and such ADDSUB can further be replaced with FMADDSUB:
7519 /// %Addsub = FMADDSUB %A, %B, %C.
7521 /// The main reason why this method is called before the replacement of the
7522 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7523 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7525 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7527 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7528 unsigned ExpectedUses) {
// The multiply must feed only the matched add/sub lanes and FMA must exist.
7529 if (Opnd0.getOpcode() != ISD::FMUL ||
7530 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
7533 // FIXME: These checks must match the similar ones in
7534 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7535 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7536 // or MUL + ADDSUB to FMADDSUB.
7537 const TargetOptions &Options = DAG.getTarget().Options;
7539 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
// Decompose: FMADDSUB(A, B, C) where Opnd0 = fmul A, B and Opnd1 = C.
7544 Opnd1 = Opnd0.getOperand(1);
7545 Opnd0 = Opnd0.getOperand(0);
7550 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7551 /// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.
// Returns an empty SDValue when the build_vector does not match the idiom.
7552 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7553 const X86Subtarget &Subtarget,
7554 SelectionDAG &DAG) {
7555 SDValue Opnd0, Opnd1;
7556 unsigned NumExtracts;
7557 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts))
7560 MVT VT = BV->getSimpleValueType(0);
7563 // Try to generate X86ISD::FMADDSUB node here.
7565 // TODO: According to coverage reports, the FMADDSUB transform is not
7566 // triggered by any tests.
7567 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
7568 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7570 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7571 // the ADDSUB idiom has been successfully recognized. There are no known
7572 // X86 targets with 512-bit ADDSUB instructions!
7573 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7575 if (VT.is512BitVector())
7578 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7581 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
// Matches 128-bit SSE3/SSSE3 patterns directly, and 256-bit AVX/AVX2
// patterns either as a single wide horizontal op or as a pair of 128-bit
// ops expanded via ExpandHorizontalBinOp. Returns empty on no match.
7582 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7583 const X86Subtarget &Subtarget,
7584 SelectionDAG &DAG) {
7585 MVT VT = BV->getSimpleValueType(0);
7586 unsigned NumElts = VT.getVectorNumElements();
7587 unsigned NumUndefsLO = 0;
7588 unsigned NumUndefsHI = 0;
7589 unsigned Half = NumElts/2;
7591 // Count the number of UNDEF operands in the build_vector in input.
7592 for (unsigned i = 0, e = Half; i != e; ++i)
7593 if (BV->getOperand(i)->isUndef())
7596 for (unsigned i = Half, e = NumElts; i != e; ++i)
7597 if (BV->getOperand(i)->isUndef())
7600 // Early exit if this is either a build_vector of all UNDEFs or all the
7601 // operands but one are UNDEF.
7602 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7606 SDValue InVec0, InVec1;
7607 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7608 // Try to match an SSE3 float HADD/HSUB.
7609 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7610 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7612 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7613 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7614 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7615 // Try to match an SSSE3 integer HADD/HSUB.
7616 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7617 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7619 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7620 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
// Everything below targets 256-bit vectors and needs at least AVX.
7623 if (!Subtarget.hasAVX())
7626 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7627 // Try to match an AVX horizontal add/sub of packed single/double
7628 // precision floating point values from 256-bit vectors.
// Each 128-bit half is matched separately; the halves must agree on (or
// leave undef) the two source vectors for a single wide FHADD/FHSUB.
7629 SDValue InVec2, InVec3;
7630 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7631 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7632 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7633 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7634 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7636 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7637 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7638 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7639 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7640 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7641 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7642 // Try to match an AVX2 horizontal add/sub of signed integers.
7643 SDValue InVec2, InVec3;
7645 bool CanFold = true;
7647 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7648 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7649 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7650 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7651 X86Opcode = X86ISD::HADD;
7652 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7653 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7654 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7655 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7656 X86Opcode = X86ISD::HSUB;
7661 // Fold this build_vector into a single horizontal add/sub.
7662 // Do this only if the target has AVX2.
7663 if (Subtarget.hasAVX2())
7664 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7666 // Do not try to expand this build_vector into a pair of horizontal
7667 // add/sub if we can emit a pair of scalar add/sub.
7668 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7671 // Convert this build_vector into a pair of horizontal binop followed by
// a concat (AVX-only path; Mode=false pairs matching halves of the inputs).
7673 bool isUndefLO = NumUndefsLO == Half;
7674 bool isUndefHI = NumUndefsHI == Half;
7675 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7676 isUndefLO, isUndefHI);
// Fall-back: match a single horizontal op across the whole 256-bit vector
// and split it into two 128-bit ops (Mode=true: within each input vector).
7680 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7681 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7683 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7684 X86Opcode = X86ISD::HADD;
7685 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7686 X86Opcode = X86ISD::HSUB;
7687 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7688 X86Opcode = X86ISD::FHADD;
7689 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7690 X86Opcode = X86ISD::FHSUB;
7694 // Don't try to expand this build_vector into a pair of horizontal add/sub
7695 // if we can simply emit a pair of scalar add/sub.
7696 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7699 // Convert this build_vector into two horizontal add/sub followed by
7701 bool isUndefLO = NumUndefsLO == Half;
7702 bool isUndefHI = NumUndefsHI == Half;
7703 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7704 isUndefLO, isUndefHI);
7710 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7711 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7712 /// just apply the bit to the vectors.
7713 /// NOTE: Its not in our interest to start make a general purpose vectorizer
7714 /// from this, but enough scalar bit operations are created from the later
7715 /// legalization + scalarization stages to need basic support.
// Returns an empty SDValue when the pattern does not match or the vector
// form of the opcode is not legal for this type.
7716 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7717 SelectionDAG &DAG) {
7719 MVT VT = Op->getSimpleValueType(0);
7720 unsigned NumElems = VT.getVectorNumElements();
7721 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7723 // Check that all elements have the same opcode.
7724 // TODO: Should we allow UNDEFS and if so how many?
7725 unsigned Opcode = Op->getOperand(0).getOpcode();
7726 for (unsigned i = 1; i < NumElems; ++i)
7727 if (Opcode != Op->getOperand(i).getOpcode())
7730 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7737 // Don't do this if the buildvector is a splat - we'd replace one
7738 // constant with an entire vector.
7739 if (Op->getSplatValue())
7741 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
// Split every element's binop into per-lane LHS and RHS operand lists.
7746 SmallVector<SDValue, 4> LHSElts, RHSElts;
7747 for (SDValue Elt : Op->ops()) {
7748 SDValue LHS = Elt.getOperand(0);
7749 SDValue RHS = Elt.getOperand(1);
7751 // We expect the canonicalized RHS operand to be the constant.
7752 if (!isa<ConstantSDNode>(RHS))
7754 LHSElts.push_back(LHS);
7755 RHSElts.push_back(RHS);
// Rebuild as two vectors and apply the bit operation vector-wide.
7758 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7759 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7760 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7763 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7764 /// functionality to do this, so it's all zeros, all ones, or some derivation
7765 /// that is cheap to calculate.
7766 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7767 const X86Subtarget &Subtarget) {
7769 MVT VT = Op.getSimpleValueType();
7771 // Vectors containing all zeros can be matched by pxor and xorps.
7772 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7773 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7774 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7775 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7778 return getZeroVector(VT, Subtarget, DAG, DL);
7781 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7782 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7783 // vpcmpeqd on 256-bit vectors.
7784 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7785 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7786 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7789 return getOnesVector(VT, DAG, DL);
7795 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
7796 // reasoned to be a permutation of a vector by indices in a non-constant vector.
7797 // (build_vector (extract_elt V, (extract_elt I, 0)),
7798 // (extract_elt V, (extract_elt I, 1)),
7803 // TODO: Handle undefs
7804 // TODO: Utilize pshufb and zero mask blending to support more efficient
7805 // construction of vectors with constant-0 elements.
7806 // TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
7807 // when no native operation available.
7809 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
7810 const X86Subtarget &Subtarget) {
7811 // Look for VPERMV and PSHUFB opportunities.
// Gate each vector type on the subtarget feature that provides a variable
// permute for it (PSHUFB for v16i8, VPERM* otherwise).
7812 MVT VT = V.getSimpleValueType();
7813 switch (VT.SimpleTy) {
7817 if (!Subtarget.hasSSE3())
7822 if (!Subtarget.hasAVX2())
7827 if (!Subtarget.hasVLX())
7834 if (!Subtarget.hasAVX512())
7838 if (!Subtarget.hasBWI())
7843 if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
7847 if (!Subtarget.hasVBMI())
7851 if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
7855 SDValue SrcVec, IndicesVec;
7856 // Check for a match of the permute source vector and permute index elements.
7857 // This is done by checking that the i-th build_vector operand is of the form:
7858 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
7859 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
7860 SDValue Op = V.getOperand(Idx);
7861 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7864 // If this is the first extract encountered in V, set the source vector,
7865 // otherwise verify the extract is from the previously defined source
7868 SrcVec = Op.getOperand(0);
7869 else if (SrcVec != Op.getOperand(0))
7871 SDValue ExtractedIndex = Op->getOperand(1);
7872 // Peek through extends.
7873 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
7874 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
7875 ExtractedIndex = ExtractedIndex.getOperand(0);
7876 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7879 // If this is the first extract from the index vector candidate, set the
7880 // indices vector, otherwise verify the extract is from the previously
7881 // defined indices vector.
7883 IndicesVec = ExtractedIndex.getOperand(0);
7884 else if (IndicesVec != ExtractedIndex.getOperand(0))
// The i-th lane must read index element i, or this is not a permute of V.
7887 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
7888 if (!PermIdx || PermIdx->getZExtValue() != Idx)
// FP results still need an integer index vector of matching element width.
7892 if (VT.isFloatingPoint())
7893 IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
7894 VT.getVectorNumElements());
7895 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
// Widen a narrower source into the low part of a VT-sized vector so the
// permute node's operands agree in size.
7896 if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
7898 DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
7899 SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
// Note the operand order difference: PSHUFB takes (src, mask), VPERMV
// takes (mask, src).
7901 if (VT == MVT::v16i8)
7902 return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
7903 return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
// Main BUILD_VECTOR lowering entry point: tries a cascade of strategies in
// cost order (constants, broadcasts, horizontal ops, single-element inserts,
// vector loads, half-vector recursion, unpack trees), falling through to the
// next strategy whenever one does not apply.
// NOTE(review): this excerpt elides many interior lines (early returns,
// several closing braces, and parts of each strategy) — the comments below
// only describe what the visible lines establish.
7907 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7910 MVT VT = Op.getSimpleValueType();
7911 MVT ExtVT = VT.getVectorElementType();
7912 unsigned NumElems = Op.getNumOperands();
7914 // Generate vectors for predicate vectors.
7915 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7916 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
7918 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7919 return VectorConstant;
7921 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7922 // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
7924 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7926 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7927 return HorizontalOp;
7928 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7930 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7933 unsigned EVTBits = ExtVT.getSizeInBits();
// Scan the operands once, classifying each element as zero / non-zero /
// constant and recording non-zero positions in the NonZeros bitmask.
7935 unsigned NumZero = 0;
7936 unsigned NumNonZero = 0;
7937 uint64_t NonZeros = 0;
7938 bool IsAllConstants = true;
7939 SmallSet<SDValue, 8> Values;
7940 unsigned NumConstants = NumElems;
7941 for (unsigned i = 0; i < NumElems; ++i) {
7942 SDValue Elt = Op.getOperand(i);
7946 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
7947 IsAllConstants = false;
7950 if (X86::isZeroNode(Elt))
7953 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7954 NonZeros |= ((uint64_t)1 << i);
7959 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7960 if (NumNonZero == 0)
7961 return DAG.getUNDEF(VT);
7963 // If we are inserting one variable into a vector of non-zero constants, try
7964 // to avoid loading each constant element as a scalar. Load the constants as a
7965 // vector and then insert the variable scalar element. If insertion is not
7966 // supported, we assume that we will fall back to a shuffle to get the scalar
7967 // blended with the constants. Insertion into a zero vector is handled as a
7968 // special-case somewhere below here.
7969 LLVMContext &Context = *DAG.getContext();
7970 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
7971 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
7972 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
7973 // Create an all-constant vector. The variable element in the old
7974 // build vector is replaced by undef in the constant vector. Save the
7975 // variable scalar element and its index for use in the insertelement.
7976 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
7977 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
7980 for (unsigned i = 0; i != NumElems; ++i) {
7981 SDValue Elt = Op.getOperand(i);
7982 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
7983 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
7984 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
7985 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
7986 else if (!Elt.isUndef()) {
7987 assert(!VarElt.getNode() && !InsIndex.getNode() &&
7988 "Expected one variable element in this vector");
7990 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
7993 Constant *CV = ConstantVector::get(ConstVecOps);
7994 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
7996 // The constants we just created may not be legal (eg, floating point). We
7997 // must lower the vector right here because we can not guarantee that we'll
7998 // legalize it before loading it. This is also why we could not just create
7999 // a new build vector here. If the build vector contains illegal constants,
8000 // it could get split back up into a series of insert elements.
8001 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8002 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8003 MachineFunction &MF = DAG.getMachineFunction();
8004 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8005 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8006 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8009 // Special case for single non-zero, non-undef, element.
8010 if (NumNonZero == 1) {
8011 unsigned Idx = countTrailingZeros(NonZeros);
8012 SDValue Item = Op.getOperand(Idx);
8014 // If this is an insertion of an i64 value on x86-32, and if the top bits of
8015 // the value are obviously zero, truncate the value to i32 and do the
8016 // insertion that way. Only do this if the value is non-constant or if the
8017 // value is a constant being inserted into element 0. It is cheaper to do
8018 // a constant pool load than it is to do a movd + shuffle.
8019 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
8020 (!IsAllConstants || Idx == 0)) {
8021 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
8023 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
8024 MVT VecVT = MVT::v4i32;
8026 // Truncate the value (which may itself be a constant) to i32, and
8027 // convert it to a vector with movd (S2V+shuffle to zero extend).
8028 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
8029 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
8030 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
8031 Item, Idx * 2, true, Subtarget, DAG));
8035 // If we have a constant or non-constant insertion into the low element of
8036 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8037 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8038 // depending on what the source datatype is.
8041 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8043 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
8044 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
8045 assert((VT.is128BitVector() || VT.is256BitVector() ||
8046 VT.is512BitVector()) &&
8047 "Expected an SSE value type!");
8048 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8049 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
8050 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8053 // We can't directly insert an i8 or i16 into a vector, so zero extend
8055 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
8056 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8057 if (VT.getSizeInBits() >= 256) {
8058 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
8059 if (Subtarget.hasAVX()) {
8060 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8061 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8063 // Without AVX, we need to extend to a 128-bit vector and then
8064 // insert into the 256-bit vector.
8065 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8066 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
8067 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
8070 assert(VT.is128BitVector() && "Expected an SSE value type!");
8071 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8072 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8074 return DAG.getBitcast(VT, Item);
8078 // Is it a vector logical left shift?
8079 if (NumElems == 2 && Idx == 1 &&
8080 X86::isZeroNode(Op.getOperand(0)) &&
8081 !X86::isZeroNode(Op.getOperand(1))) {
8082 unsigned NumBits = VT.getSizeInBits();
8083 return getVShift(true, VT,
8084 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8085 VT, Op.getOperand(1)),
8086 NumBits/2, DAG, *this, dl);
8089 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8092 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8093 // is a non-constant being inserted into an element other than the low one,
8094 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8095 // movd/movss) to move this into the low element, then shuffle it into
8097 if (EVTBits == 32) {
8098 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8099 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8103 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8104 if (Values.size() == 1) {
8105 if (EVTBits == 32) {
8106 // Instead of a shuffle like this:
8107 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8108 // Check if it's possible to issue this instead.
8109 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
8110 unsigned Idx = countTrailingZeros(NonZeros);
8111 SDValue Item = Op.getOperand(Idx);
8112 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8113 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8118 // A vector full of immediates; various special cases are already
8119 // handled, so this is best done with a single constant-pool load.
8123 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8126 // See if we can use a vector load to get all of the elements.
8127 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
8128 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8130 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8134 // For AVX-length vectors, build the individual 128-bit pieces and use
8135 // shuffles to put them in place.
8136 if (VT.is256BitVector() || VT.is512BitVector()) {
8137 EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);
8139 // Build both the lower and upper subvector.
8141 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8142 SDValue Upper = DAG.getBuildVector(
8143 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8145 // Recreate the wider vector with the lower and upper part.
8146 if (VT.is256BitVector())
8147 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
8148 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
8151 // Let legalizer expand 2-wide build_vectors.
8152 if (EVTBits == 64) {
8153 if (NumNonZero == 1) {
8154 // One half is zero or undef.
8155 unsigned Idx = countTrailingZeros(NonZeros);
8156 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8157 Op.getOperand(Idx));
8158 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8163 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8164 if (EVTBits == 8 && NumElems == 16)
8165 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8169 if (EVTBits == 16 && NumElems == 8)
8170 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8174 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8175 if (EVTBits == 32 && NumElems == 4)
8176 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8179 // If element VT is == 32 bits, turn it into a number of shuffles.
8180 if (NumElems == 4 && NumZero > 0) {
8181 SmallVector<SDValue, 8> Ops(NumElems);
// First materialize each element as either a zero vector or a
// SCALAR_TO_VECTOR, then pairwise combine with MOVL/unpack.
8182 for (unsigned i = 0; i < 4; ++i) {
8183 bool isZero = !(NonZeros & (1ULL << i));
8185 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8187 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8190 for (unsigned i = 0; i < 2; ++i) {
8191 switch ((NonZeros >> (i*2)) & 0x3) {
8192 default: llvm_unreachable("Unexpected NonZero count");
8194 Ops[i] = Ops[i*2]; // Must be a zero vector.
8197 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8200 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8203 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
// Reverse flags track whether each pair was combined in swapped order so
// the final shuffle mask can compensate.
8208 bool Reverse1 = (NonZeros & 0x3) == 2;
8209 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8213 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8214 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8216 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8219 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8221 // Check for a build vector from mostly shuffle plus few inserting.
8222 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8225 // For SSE 4.1, use insertps to put the high elements into the low element.
8226 if (Subtarget.hasSSE41()) {
8228 if (!Op.getOperand(0).isUndef())
8229 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8231 Result = DAG.getUNDEF(VT);
8233 for (unsigned i = 1; i < NumElems; ++i) {
8234 if (Op.getOperand(i).isUndef()) continue;
8235 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8236 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8241 // Otherwise, expand into a number of unpckl*, start by extending each of
8242 // our (non-undef) elements to the full vector width with the element in the
8243 // bottom slot of the vector (which generates no code for SSE).
8244 SmallVector<SDValue, 8> Ops(NumElems);
8245 for (unsigned i = 0; i < NumElems; ++i) {
8246 if (!Op.getOperand(i).isUndef())
8247 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8249 Ops[i] = DAG.getUNDEF(VT);
8252 // Next, we iteratively mix elements, e.g. for v4f32:
8253 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8254 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8255 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8256 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8257 // Generate scaled UNPCKL shuffle mask.
8258 SmallVector<int, 16> Mask;
8259 for(unsigned i = 0; i != Scale; ++i)
8261 for (unsigned i = 0; i != Scale; ++i)
8262 Mask.push_back(NumElems+i);
8263 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8265 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8266 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8271 // 256-bit AVX can use the vinsertf128 instruction
8272 // to create 256-bit vectors from two other 128-bit ones.
// Lower CONCAT_VECTORS producing a 256- or 512-bit result by chaining the
// concat helpers (matched later as vinsertf128/vinserti64x4 etc.):
//   256-bit  = concat128(V1, V2)
//   512-bit  = concat256(V1, V2), or for a 4-operand concat,
//              concat256(concat128(V1,V2), concat128(V3,V4)).
8273 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
8275 MVT ResVT = Op.getSimpleValueType();
8277 assert((ResVT.is256BitVector() ||
8278 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8280 SDValue V1 = Op.getOperand(0);
8281 SDValue V2 = Op.getOperand(1);
8282 unsigned NumElems = ResVT.getVectorNumElements();
8283 if (ResVT.is256BitVector())
8284 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
// 512-bit with four 128-bit operands: build the two 256-bit halves first.
8286 if (Op.getNumOperands() == 4) {
8287 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8288 ResVT.getVectorNumElements()/2);
8289 SDValue V3 = Op.getOperand(2);
8290 SDValue V4 = Op.getOperand(3);
8291 return concat256BitVectors(
8292 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
8293 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
8296 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8299 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8300 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
// Predicate: returns whether every operand of the CONCAT_VECTORS except
// operand 0 is an all-zeros build vector, i.e. (CONCAT_VECTORS Op, 0, ..., 0).
// NOTE(review): the trailing "return true;" path is elided from this excerpt.
8301 static bool isExpandWithZeros(const SDValue &Op) {
8302 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8303 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8305 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8306 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8312 // Returns true if the given node is a type promotion (by concatenating i1
8313 // zeros) of the result of a node that already zeros all upper bits of
// Detects whether this vXi1 CONCAT_VECTORS is merely a type promotion
// (widening with zero lanes) of a node that already zeroes its upper bits.
// Climbs through INSERT_SUBVECTOR-into-zeros and concat-with-zeros wrappers,
// then checks the innermost node (or its AND mask operands) with
// isMaskedZeroUpperBitsvXi1. Returns the inner node on success.
// NOTE(review): the success/failure return statements are elided here.
8315 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8316 unsigned Opc = Op.getOpcode();
8318 assert(Opc == ISD::CONCAT_VECTORS &&
8319 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8320 "Unexpected node to check for type promotion!");
8322 // As long as we are concatenating zeros to the upper part of a previous node
8323 // result, climb up the tree until a node with different opcode is
8325 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8326 if (Opc == ISD::INSERT_SUBVECTOR) {
// (insert_subvector zeros, X, 0) — strip to X.
8327 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8328 Op.getConstantOperandVal(2) == 0)
8329 Op = Op.getOperand(1);
8332 } else { // Opc == ISD::CONCAT_VECTORS
// (concat X, 0, ..., 0) — strip to X.
8333 if (isExpandWithZeros(Op))
8334 Op = Op.getOperand(0);
8338 Opc = Op.getOpcode();
8341 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8342 // of a node that zeros the upper bits (its masked version).
8343 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8344 (Op.getOpcode() == ISD::AND &&
8345 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8346 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
// Lower a CONCAT_VECTORS of vXi1 (AVX-512 mask) vectors. Strategy order:
//   1. Recognize zero-extension type promotions and re-emit them as a single
//      INSERT_SUBVECTOR into an all-zeros mask (pattern-matched later).
//   2. With <=1 non-zero operand, insert it into a zero/undef base.
//   3. With >2 operands, split recursively into two half-width concats.
//   4. Otherwise (exactly 2 non-zeros): legal via KUNPCK when wide enough,
//      else a pair of INSERT_SUBVECTORs.
8353 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8354 const X86Subtarget &Subtarget,
8355 SelectionDAG & DAG) {
8357 MVT ResVT = Op.getSimpleValueType();
8358 unsigned NumOperands = Op.getNumOperands();
8360 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
8361 "Unexpected number of operands in CONCAT_VECTORS");
8363 // If this node promotes - by concatenating zeroes - the type of the result
8364 // of a node with instruction that zeroes all upper (irrelevant) bits of the
8365 // output register, mark it as legal and catch the pattern in instruction
8366 // selection to avoid emitting extra instructions (for zeroing upper bits).
8367 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8368 SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
8369 SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
8370 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
// Classify operands: undef / all-zeros / non-zero, recording non-zero
// positions in the NonZeros bitmask.
8374 unsigned NumZero = 0;
8375 unsigned NumNonZero = 0;
8376 uint64_t NonZeros = 0;
8377 for (unsigned i = 0; i != NumOperands; ++i) {
8378 SDValue SubVec = Op.getOperand(i);
8379 if (SubVec.isUndef())
8381 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8384 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8385 NonZeros |= (uint64_t)1 << i;
8391 // If there are zero or one non-zeros we can handle this very simply.
8392 if (NumNonZero <= 1) {
8393 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8394 : DAG.getUNDEF(ResVT);
8397 unsigned Idx = countTrailingZeros(NonZeros);
8398 SDValue SubVec = Op.getOperand(Idx);
8399 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
8400 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
8401 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
8404 if (NumOperands > 2) {
8405 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8406 ResVT.getVectorNumElements()/2);
8407 ArrayRef<SDUse> Ops = Op->ops();
8408 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8409 Ops.slice(0, NumOperands/2));
8410 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8411 Ops.slice(NumOperands/2));
8412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8415 assert(NumNonZero == 2 && "Simple cases not handled?");
8417 if (ResVT.getVectorNumElements() >= 16)
8418 return Op; // The operation is legal with KUNPCK
8420 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
8421 DAG.getUNDEF(ResVT), Op.getOperand(0),
8422 DAG.getIntPtrConstant(0, dl));
8423 unsigned NumElems = ResVT.getVectorNumElements();
8424 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
8425 DAG.getIntPtrConstant(NumElems/2, dl));
// Top-level CONCAT_VECTORS lowering dispatcher: vXi1 masks go to the
// AVX-512 mask path, everything else (256/512-bit with 2 or 4 operands)
// to LowerAVXCONCAT_VECTORS.
8428 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8429 const X86Subtarget &Subtarget,
8430 SelectionDAG &DAG) {
8431 MVT VT = Op.getSimpleValueType();
8432 if (VT.getVectorElementType() == MVT::i1)
8433 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8435 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8436 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8437 Op.getNumOperands() == 4)));
8439 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8440 // from two other 128-bit ones.
8442 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8443 return LowerAVXCONCAT_VECTORS(Op, DAG);
8446 //===----------------------------------------------------------------------===//
8447 // Vector shuffle lowering
8449 // This is an experimental code path for lowering vector shuffles on x86. It is
8450 // designed to handle arbitrary vector shuffles and blends, gracefully
8451 // degrading performance as necessary. It works hard to recognize idiomatic
8452 // shuffles and lower them to optimal instruction patterns without leaving
8453 // a framework that allows reasonably efficient handling of all vector shuffle
8455 //===----------------------------------------------------------------------===//
8457 /// \brief Tiny helper function to identify a no-op mask.
8459 /// This is a somewhat boring predicate function. It checks whether the mask
8460 /// array input, which is assumed to be a single-input shuffle mask of the kind
8461 /// used by the X86 shuffle instructions (not a fully general
8462 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8463 /// in-place shuffle are 'no-op's.
// Returns whether the mask performs no shuffling at all: every element is
// either undef (-1) or maps lane i to lane i.
// NOTE(review): the "return false;"/"return true;" lines are elided here.
8464 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8465 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8466 assert(Mask[i] >= -1 && "Out of bound mask element!");
8467 if (Mask[i] >= 0 && Mask[i] != i)
8473 /// \brief Test whether there are elements crossing 128-bit lanes in this
8476 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8477 /// and we routinely test for these.
// Returns whether any mask element pulls data from a different 128-bit lane
// than the one it lands in ("% Size" folds second-vector indices in-range).
// NOTE(review): the return statements are elided from this excerpt.
8478 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8479 int LaneSize = 128 / VT.getScalarSizeInBits();
8480 int Size = Mask.size();
8481 for (int i = 0; i < Size; ++i)
8482 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8487 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8489 /// This checks a shuffle mask to see if it is performing the same
8490 /// lane-relative shuffle in each sub-lane. This trivially implies
8491 /// that it is also not lane-crossing. It may however involve a blend from the
8492 /// same lane of a second vector.
8494 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8495 /// non-trivial to compute in the face of undef lanes. The representation is
8496 /// suitable for use with existing 128-bit shuffles as entries from the second
8497 /// vector have been remapped to [LaneSize, 2*LaneSize).
// Checks whether Mask applies the same lane-relative shuffle in every
// LaneSizeInBits-wide sub-lane (implying no lane crossing). On success
// RepeatedMask holds the per-lane pattern, with second-vector entries
// remapped to [LaneSize, 2*LaneSize).
// NOTE(review): the final return statements are elided from this excerpt.
8498 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8500 SmallVectorImpl<int> &RepeatedMask) {
8501 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8502 RepeatedMask.assign(LaneSize, -1);
8503 int Size = Mask.size();
8504 for (int i = 0; i < Size; ++i) {
8505 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8508 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8509 // This entry crosses lanes, so there is no way to model this shuffle.
8512 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8513 // Adjust second vector indices to start at LaneSize instead of Size.
8514 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8515 : Mask[i] % LaneSize + LaneSize;
8516 if (RepeatedMask[i % LaneSize] < 0)
8517 // This is the first non-undef entry in this slot of a 128-bit lane.
8518 RepeatedMask[i % LaneSize] = LocalM;
8519 else if (RepeatedMask[i % LaneSize] != LocalM)
8520 // Found a mismatch with the repeated mask.
8526 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
// Convenience wrapper: repeated-shuffle check with a 128-bit lane width.
8528 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8529 SmallVectorImpl<int> &RepeatedMask) {
8530 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8533 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
// Convenience wrapper: repeated-shuffle check with a 256-bit lane width.
8535 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8536 SmallVectorImpl<int> &RepeatedMask) {
8537 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8540 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8541 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
// Target-shuffle variant of isRepeatedShuffleMask: additionally honors
// SM_SentinelZero entries, requiring zero slots to agree across lanes.
// NOTE(review): the final return statements are elided from this excerpt.
8542 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8544 SmallVectorImpl<int> &RepeatedMask) {
8545 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8546 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8547 int Size = Mask.size();
8548 for (int i = 0; i < Size; ++i) {
8549 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8550 if (Mask[i] == SM_SentinelUndef)
// A zero entry is compatible only with an undef or zero slot in the
// repeated mask.
8552 if (Mask[i] == SM_SentinelZero) {
8553 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8555 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8558 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8559 // This entry crosses lanes, so there is no way to model this shuffle.
8562 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8563 // Adjust second vector indices to start at LaneSize instead of Size.
8565 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8566 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8567 // This is the first non-undef entry in this slot of a 128-bit lane.
8568 RepeatedMask[i % LaneSize] = LocalM;
8569 else if (RepeatedMask[i % LaneSize] != LocalM)
8570 // Found a mismatch with the repeated mask.
8576 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8579 /// This is a fast way to test a shuffle mask against a fixed pattern:
8581 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
8583 /// It returns true if the mask is exactly as wide as the argument list, and
8584 /// each element of the mask is either -1 (signifying undef) or the value given
8585 /// in the argument.
// Returns whether Mask matches ExpectedMask element-for-element, treating -1
// (undef) entries in Mask as wildcards. When the inputs are BUILD_VECTORs,
// two differing indices are still considered equal if they select identical
// scalar operands — this catches shuffles that are equivalent only because
// the underlying build vectors repeat values.
// NOTE(review): the final return statements are elided from this excerpt.
8586 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8587 ArrayRef<int> ExpectedMask) {
8588 if (Mask.size() != ExpectedMask.size())
8591 int Size = Mask.size();
8593 // If the values are build vectors, we can look through them to find
8594 // equivalent inputs that make the shuffles equivalent.
8595 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8596 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8598 for (int i = 0; i < Size; ++i) {
8599 assert(Mask[i] >= -1 && "Out of bound mask element!");
8600 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
// Index < Size selects from the first input, otherwise the second.
8601 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8602 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8603 if (!MaskBV || !ExpectedBV ||
8604 MaskBV->getOperand(Mask[i] % Size) !=
8605 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8613 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8615 /// The masks must be exactly the same width.
8617 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8618 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8620 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
// Target-shuffle mask comparison: SM_SentinelUndef in Mask matches anything;
// any other negative entry must be SM_SentinelZero; non-negative entries must
// match exactly.
// NOTE(review): the final "return true;" is elided from this excerpt.
8621 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8622 ArrayRef<int> ExpectedMask) {
8623 int Size = Mask.size();
8624 if (Size != (int)ExpectedMask.size())
8627 for (int i = 0; i < Size; ++i)
8628 if (Mask[i] == SM_SentinelUndef)
8630 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8632 else if (Mask[i] != ExpectedMask[i])
8638 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
// Combine a plain shuffle mask with a per-element zeroable bitmask into a
// target shuffle mask: zeroable lanes become SM_SentinelZero, undefs stay
// SM_SentinelUndef, everything else keeps its index.
// NOTE(review): the "return TargetMask;" line is elided from this excerpt.
8640 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8641 const APInt &Zeroable) {
8642 int NumElts = Mask.size();
8643 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8645 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8646 for (int i = 0; i != NumElts; ++i) {
8648 if (M == SM_SentinelUndef)
8650 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8651 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8656 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// Returns whether Mask (for a v8i32/v8f32 shuffle) matches the mask of a
// two-input v8i16 unpcklwd or unpckhwd, by generating both reference masks
// and comparing with isTargetShuffleEquivalent.
8658 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8659 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8662 SmallVector<int, 8> Unpcklwd;
8663 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8664 /* Unary = */ false);
8665 SmallVector<int, 8> Unpckhwd;
8666 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8667 /* Unary = */ false);
8668 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8669 isTargetShuffleEquivalent(Mask, Unpckhwd));
8670 return IsUnpackwdMask;
8673 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8675 /// This helper function produces an 8-bit shuffle immediate corresponding to
8676 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8677 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8680 /// NB: We rely heavily on "undef" masks preserving the input lane.
// Encode a 4-element mask as the 8-bit x86 shuffle immediate (2 bits per
// lane). Undef lanes default to their identity index, preserving the input.
// NOTE(review): the "unsigned Imm = 0;" declaration and the "return Imm;"
// lines are elided from this excerpt.
8681 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8682 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8683 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8684 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8685 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8686 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8689 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8690 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8691 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8692 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
// Wrap the 4-lane shuffle immediate in an i8 constant SDValue for use as an
// instruction operand.
8697 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8698 SelectionDAG &DAG) {
8699 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8701 /// \brief Compute whether each element of a shuffle is zeroable.
8703 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8704 /// Either it is an undef element in the shuffle mask, the element of the input
8705 /// referenced is undef, or the element of the input referenced is known to be
8706 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8707 /// as many lanes with this technique as possible to simplify the remaining
// Compute an APInt bitmask of shuffle lanes that can be lowered to zero:
// undef mask entries, lanes sourced from an all-zeros vector, or lanes whose
// BUILD_VECTOR source element is provably zero — including through bitcasts
// where element counts differ by an integer factor.
// NOTE(review): several interior lines (loop continuations, the Zeroable
// set-bit statements, and the final return) are elided from this excerpt.
8710 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8711 SDValue V1, SDValue V2) {
8712 APInt Zeroable(Mask.size(), 0);
8713 V1 = peekThroughBitcasts(V1);
8714 V2 = peekThroughBitcasts(V2);
8716 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8717 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8719 int VectorSizeInBits = V1.getValueSizeInBits();
8720 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8721 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8723 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8725 // Handle the easy cases.
8726 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8731 // Determine shuffle input and normalize the mask.
8732 SDValue V = M < Size ? V1 : V2;
8735 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8736 if (V.getOpcode() != ISD::BUILD_VECTOR)
8739 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
8740 // the (larger) source element must be UNDEF/ZERO.
8741 if ((Size % V.getNumOperands()) == 0) {
8742 int Scale = Size / V->getNumOperands();
8743 SDValue Op = V.getOperand(M / Scale);
8744 if (Op.isUndef() || X86::isZeroNode(Op))
// For a constant source element, extract the sub-lane's bits and test
// whether they are all zero.
8746 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8747 APInt Val = Cst->getAPIntValue();
8748 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8749 Val = Val.getLoBits(ScalarSizeInBits);
8752 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8753 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8754 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8755 Val = Val.getLoBits(ScalarSizeInBits);
8762 // If the BUILD_VECTOR has more elements then all the (smaller) source
8763 // elements must be UNDEF or ZERO.
8764 if ((V.getNumOperands() % Size) == 0) {
8765 int Scale = V->getNumOperands() / Size;
8766 bool AllZeroable = true;
8767 for (int j = 0; j < Scale; ++j) {
8768 SDValue Op = V.getOperand((M * Scale) + j);
8769 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8779 // The shuffle result takes the following form:
8780 // 0*a[0]0*a[1]...0*a[n] , n >=0 where the a[] elements are in ascending order.
8781 // Each element of Zeroable corresponds to a particular element of Mask, as
8782 // described in the computeZeroableShuffleElements function.
8784 // This function looks for a sub-mask whose nonzero elements appear in
8785 // increasing order; if such a sub-mask exists, the function returns true.
8786 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8787 ArrayRef<int> Mask, const EVT &VectorType,
8788 bool &IsZeroSideLeft) {
// Next mask value we expect for the in-order nonzero run; -1 until the
// first nonzero element has been found.
8789 int NextElement = -1;
8790 // Check if the Mask's nonzero elements are in increasing order.
8791 for (int i = 0, e = Mask.size(); i < e; i++) {
8792 // Checks if the mask's zeros elements are built from only zeros.
8793 assert(Mask[i] >= -1 && "Out of bound mask element!");
8798 // Find the lowest non zero element
8799 if (NextElement < 0) {
// A run starting at a nonzero mask value means the zeros precede it
// (zero side on the left); the expected run then starts at NumElements.
8800 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8801 IsZeroSideLeft = NextElement != 0;
8803 // Exit if the mask's non zero elements are not in increasing order.
8804 if (NextElement != Mask[i])
8811 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
// Builds a per-byte PSHUFB control vector: 0x80 (sign bit) produces a zero
// byte, undef mask elements stay undef, and everything else indexes into a
// single source input. The bail-out paths (both inputs needed, or a byte
// crossing a 128-bit lane) return in lines elided from this listing.
8812 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8813 ArrayRef<int> Mask, SDValue V1,
8815 const APInt &Zeroable,
8816 const X86Subtarget &Subtarget,
8817 SelectionDAG &DAG) {
8818 int Size = Mask.size();
// Elements per 128-bit lane; PSHUFB cannot move bytes across lanes.
8819 int LaneSize = 128 / VT.getScalarSizeInBits();
8820 const int NumBytes = VT.getSizeInBits() / 8;
8821 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8823 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8824 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8825 (Subtarget.hasBWI() && VT.is512BitVector()));
8827 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8828 // Sign bit set in i8 mask means zero element.
8829 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8832 for (int i = 0; i < NumBytes; ++i) {
8833 int M = Mask[i / NumEltBytes];
8835 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8838 if (Zeroable[i / NumEltBytes]) {
8839 PSHUFBMask[i] = ZeroMask;
8843 // We can only use a single input of V1 or V2.
8844 SDValue SrcV = (M >= Size ? V2 : V1);
8850 // PSHUFB can't cross lanes, ensure this doesn't happen.
8851 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
// Rescale the element index to a byte index within the source.
8855 M = M * NumEltBytes + (i % NumEltBytes);
8856 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8858 assert(V && "Failed to find a source input");
8860 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8861 return DAG.getBitcast(
8862 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8863 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8866 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8867 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8870 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
// Matches a shuffle whose nonzero elements are in increasing order (see
// isNonZeroElementsInOrder above) and lowers it as X86ISD::EXPAND selected
// against a zero vector under an i1 mask.
8871 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8872 const APInt &Zeroable,
8873 ArrayRef<int> Mask, SDValue &V1,
8874 SDValue &V2, SelectionDAG &DAG,
8875 const X86Subtarget &Subtarget) {
8876 bool IsLeftZeroSide = true;
8877 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
// The expand mask selects the NON-zeroable lanes.
8880 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8882 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8883 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8884 unsigned NumElts = VT.getVectorNumElements();
8885 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8886 "Unexpected number of vector elements");
8887 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8888 Subtarget, DAG, DL);
8889 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
// Expand from whichever input holds the data (the other side is zeros).
8890 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8891 return DAG.getSelect(DL, VT, VMask,
8892 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
// Attempt to match a target shuffle mask against the UNPCKL/UNPCKH patterns,
// including unary forms, unpack-with-zero, and the commuted variants. On a
// successful match, sets UnpackOpcode and canonicalizes V1/V2 in place.
8896 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8897 unsigned &UnpackOpcode, bool IsUnary,
8898 ArrayRef<int> TargetMask, SDLoc &DL,
8900 const X86Subtarget &Subtarget) {
8901 int NumElts = VT.getVectorNumElements();
// Track whether every even-position (1) / odd-position (2) mask element is
// undef or zeroable; unpack interleaves the two inputs pairwise.
8903 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8904 for (int i = 0; i != NumElts; i += 2) {
8905 int M1 = TargetMask[i + 0];
8906 int M2 = TargetMask[i + 1];
8907 Undef1 &= (SM_SentinelUndef == M1);
8908 Undef2 &= (SM_SentinelUndef == M2);
8909 Zero1 &= isUndefOrZero(M1);
8910 Zero2 &= isUndefOrZero(M2);
8912 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8913 "Zeroable shuffle detected");
8915 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8916 SmallVector<int, 64> Unpckl, Unpckh;
8917 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8918 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8919 UnpackOpcode = X86ISD::UNPCKL;
8920 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8921 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8925 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8926 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8927 UnpackOpcode = X86ISD::UNPCKH;
8928 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8929 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8933 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
8934 if (IsUnary && (Zero1 || Zero2)) {
8935 // Don't bother if we can blend instead.
8936 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8937 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8940 bool MatchLo = true, MatchHi = true;
8941 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8942 int M = TargetMask[i];
8944 // Ignore if the input is known to be zero or the index is undef.
8945 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8946 (M == SM_SentinelUndef))
8949 MatchLo &= (M == Unpckl[i]);
8950 MatchHi &= (M == Unpckh[i]);
8953 if (MatchLo || MatchHi) {
8954 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
// Substitute a real zero vector for whichever interleave side is zero.
8955 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8956 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8961 // If a binary shuffle, commute and try again.
8963 ShuffleVectorSDNode::commuteMask(Unpckl);
8964 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8965 UnpackOpcode = X86ISD::UNPCKL;
8970 ShuffleVectorSDNode::commuteMask(Unpckh);
8971 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8972 UnpackOpcode = X86ISD::UNPCKH;
8981 // X86 has dedicated unpack instructions that can handle specific blend
8982 // operations: UNPCKH and UNPCKL.
// Tries the binary unpack lo/hi patterns directly, then with the inputs
// commuted; returns an empty SDValue when the mask matches neither.
8983 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8984 ArrayRef<int> Mask, SDValue V1,
8985 SDValue V2, SelectionDAG &DAG) {
8986 SmallVector<int, 8> Unpckl;
8987 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8988 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8989 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8991 SmallVector<int, 8> Unpckh;
8992 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8993 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8994 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8996 // Commute and try again.
8997 ShuffleVectorSDNode::commuteMask(Unpckl);
8998 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8999 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9001 ShuffleVectorSDNode::commuteMask(Unpckh);
9002 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9003 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
9008 // X86 has dedicated pack instructions that can handle specific truncation
9009 // operations: PACKSS and PACKUS.
// Attempts to match the mask as a PACKSS/PACKUS of inputs bitcast to twice
// the scalar width. PACKSS requires the source values to be in-range sign
// extensions (enough known sign bits); PACKUS requires known-zero high bits.
9010 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
9011 SDValue &V2, unsigned &PackOpcode,
9012 ArrayRef<int> TargetMask,
9014 const X86Subtarget &Subtarget) {
9015 unsigned NumElts = VT.getVectorNumElements();
9016 unsigned BitSize = VT.getScalarSizeInBits();
// Source type has double-width scalars and half the element count.
9017 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
9018 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
9020 auto MatchPACK = [&](SDValue N1, SDValue N2) {
9021 SDValue VV1 = DAG.getBitcast(PackVT, N1);
9022 SDValue VV2 = DAG.getBitcast(PackVT, N2);
// > BitSize sign bits means the value survives signed saturation.
9023 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
9024 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
9028 PackOpcode = X86ISD::PACKSS;
// PACKUSDW (i32->i16) needs SSE4.1; PACKUSWB (i16->i8) is SSE2.
9032 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
9033 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
9034 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
9035 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
9039 PackOpcode = X86ISD::PACKUS;
9047 // Try binary shuffle.
9048 SmallVector<int, 32> BinaryMask;
9049 createPackShuffleMask(VT, BinaryMask, false);
9050 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
9051 if (MatchPACK(V1, V2))
9054 // Try unary shuffle.
9055 SmallVector<int, 32> UnaryMask;
9056 createPackShuffleMask(VT, UnaryMask, true);
9057 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
9058 if (MatchPACK(V1, V1))
// Thin wrapper: if the mask matches a PACKSS/PACKUS pattern, emit the pack
// node on the inputs bitcast to the wider source type.
9064 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
9065 ArrayRef<int> Mask, SDValue V1,
9066 SDValue V2, SelectionDAG &DAG,
9067 const X86Subtarget &Subtarget) {
9069 unsigned PackOpcode;
9070 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
9072 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
9073 DAG.getBitcast(PackVT, V2));
9078 /// \brief Try to emit a bitmask instruction for a shuffle.
9080 /// This handles cases where we can model a blend exactly as a bitmask due to
9081 /// one of the inputs being zeroable.
// Lowers to a single AND with a 0/all-ones constant vector: kept lanes come
// from the one non-zeroable input, the rest become zero.
9082 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9083 SDValue V2, ArrayRef<int> Mask,
9084 const APInt &Zeroable,
9085 SelectionDAG &DAG) {
9086 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9087 MVT EltVT = VT.getVectorElementType();
9088 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9089 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
// Mask operands default to zero; lanes that pass through get all-ones.
9090 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
9092 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9095 if (Mask[i] % Size != i)
9096 return SDValue(); // Not a blend.
9098 V = Mask[i] < Size ? V1 : V2;
9099 else if (V != (Mask[i] < Size ? V1 : V2))
9100 return SDValue(); // Can only let one input through the mask.
9102 VMaskOps[i] = AllOnes;
9105 return SDValue(); // No non-zeroable elements!
9107 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
9108 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
9111 /// \brief Try to emit a blend instruction for a shuffle using bit math.
9113 /// This is used as a fallback approach when first class blend instructions are
9114 /// unavailable. Currently it is only suitable for integer vectors, but could
9115 /// be generalized for floating point vectors if desirable.
// Emits (V1 & M) | (~M & V2) where M selects the V1 lanes; only identity
// lane positions are supported (pure blend, no element movement).
9116 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9117 SDValue V2, ArrayRef<int> Mask,
9118 SelectionDAG &DAG) {
9119 assert(VT.isInteger() && "Only supports integer vector types!");
9120 MVT EltVT = VT.getVectorElementType();
9121 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9122 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9123 SmallVector<SDValue, 16> MaskOps;
9124 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9125 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9126 return SDValue(); // Shuffled input!
9127 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
9130 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9131 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9132 // We have to cast V2 around.
9133 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
// ANDNP computes ~V1Mask & V2, i.e. the V2 half of the blend.
9134 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9135 DAG.getBitcast(MaskVT, V1Mask),
9136 DAG.getBitcast(MaskVT, V2)));
9137 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
9140 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9141 SDValue PreservedSrc,
9142 const X86Subtarget &Subtarget,
// Attempt to match a target shuffle mask as a lane blend of V1/V2, building
// the BLENDI-style immediate in BlendMask. Zeroable lanes may be satisfied
// by either input when that input is (or is forced to be) a zero vector,
// in which case ForceV1Zero/ForceV2Zero ask the caller to materialize it.
9145 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9146 MutableArrayRef<int> TargetMask,
9147 bool &ForceV1Zero, bool &ForceV2Zero,
9148 uint64_t &BlendMask) {
9149 bool V1IsZeroOrUndef =
9150 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9151 bool V2IsZeroOrUndef =
9152 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9155 ForceV1Zero = false, ForceV2Zero = false;
// BlendMask is a uint64_t bitmask, one bit per lane.
9156 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9158 // Attempt to generate the binary blend mask. If an input is zero then
9159 // we can use any lane.
9160 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9161 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9162 int M = TargetMask[i];
9163 if (M == SM_SentinelUndef)
9167 if (M == i + Size) {
// Lane taken from V2: set the corresponding blend bit.
9168 BlendMask |= 1ull << i;
9171 if (M == SM_SentinelZero) {
9172 if (V1IsZeroOrUndef) {
9177 if (V2IsZeroOrUndef) {
// Satisfy the zero lane from V2 and rewrite the mask accordingly.
9179 BlendMask |= 1ull << i;
9180 TargetMask[i] = i + Size;
// Expand each set bit of BlendMask into Scale consecutive set bits, so a
// blend immediate built for wide elements can drive a blend instruction
// that operates on elements Scale times narrower.
9189 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
9191 uint64_t ScaledMask = 0;
9192 for (int i = 0; i != Size; ++i)
9193 if (BlendMask & (1ull << i))
9194 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
9198 /// \brief Try to emit a blend instruction for a shuffle.
9200 /// This doesn't do any checks for the availability of instructions for blending
9201 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9202 /// be matched in the backend with the type given. What it does check for is
9203 /// that the shuffle mask is a blend, or convertible into a blend with zero.
// NOTE(review): the switch's case labels are elided from this listing; each
// visible group below handles one family of vector types.
9204 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9205 SDValue V2, ArrayRef<int> Original,
9206 const APInt &Zeroable,
9207 const X86Subtarget &Subtarget,
9208 SelectionDAG &DAG) {
9209 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9211 uint64_t BlendMask = 0;
9212 bool ForceV1Zero = false, ForceV2Zero = false;
9213 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
9217 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
9219 V1 = getZeroVector(VT, Subtarget, DAG, DL);
9221 V2 = getZeroVector(VT, Subtarget, DAG, DL);
9223 switch (VT.SimpleTy) {
// Types with a native BLENDI use the immediate directly.
9228 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9229 DAG.getConstant(BlendMask, DL, MVT::i8));
9233 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9237 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9238 // that instruction.
9239 if (Subtarget.hasAVX2()) {
9240 // Scale the blend by the number of 32-bit dwords per element.
9241 int Scale = VT.getScalarSizeInBits() / 32;
9242 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9243 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9244 V1 = DAG.getBitcast(BlendVT, V1);
9245 V2 = DAG.getBitcast(BlendVT, V2);
9246 return DAG.getBitcast(
9247 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9248 DAG.getConstant(BlendMask, DL, MVT::i8)));
9252 // For integer shuffles we need to expand the mask and cast the inputs to
9253 // v8i16s prior to blending.
9254 int Scale = 8 / VT.getVectorNumElements();
9255 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9256 V1 = DAG.getBitcast(MVT::v8i16, V1);
9257 V2 = DAG.getBitcast(MVT::v8i16, V2);
9258 return DAG.getBitcast(VT,
9259 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9260 DAG.getConstant(BlendMask, DL, MVT::i8)));
9264 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9265 SmallVector<int, 8> RepeatedMask;
9266 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9267 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9268 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
// Rebuild the 8-bit immediate from the per-lane repeated mask.
9270 for (int i = 0; i < 8; ++i)
9271 if (RepeatedMask[i] >= 8)
9272 BlendMask |= 1ull << i;
9273 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9274 DAG.getConstant(BlendMask, DL, MVT::i8));
9280 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9281 "256-bit byte-blends require AVX2 support!");
// With AVX-512 BW+VLX, use a mask-register select instead of VPBLENDVB.
9283 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9285 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9286 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9287 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9290 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9291 if (SDValue Masked =
9292 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9295 // Scale the blend by the number of bytes per element.
9296 int Scale = VT.getScalarSizeInBits() / 8;
9298 // This form of blend is always done on bytes. Compute the byte vector
9300 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9302 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9303 // mix of LLVM's code generator and the x86 backend. We tell the code
9304 // generator that boolean values in the elements of an x86 vector register
9305 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9306 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9307 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9308 // of the element (the remaining are ignored) and 0 in that high bit would
9309 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9310 // the LLVM model for boolean values in vector elements gets the relevant
9311 // bit set, it is set backwards and over constrained relative to x86's
9313 SmallVector<SDValue, 32> VSELECTMask;
9314 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9315 for (int j = 0; j < Scale; ++j)
9316 VSELECTMask.push_back(
9317 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9318 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9321 V1 = DAG.getBitcast(BlendVT, V1);
9322 V2 = DAG.getBitcast(BlendVT, V2);
9323 return DAG.getBitcast(
9325 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
9335 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9336 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9337 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9340 llvm_unreachable("Not a supported integer vector type!");
9344 /// \brief Try to lower as a blend of elements from two inputs followed by
9345 /// a single-input permutation.
9347 /// This matches the pattern where we can blend elements from two inputs and
9348 /// then reduce the shuffle to a single-input permutation.
9349 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9350 SDValue V1, SDValue V2,
9352 SelectionDAG &DAG) {
9353 // We build up the blend mask while checking whether a blend is a viable way
9354 // to reduce the shuffle.
// BlendMask: per-slot source element for the initial blend (identity lane
// positions). PermuteMask: the follow-up single-input permutation.
9355 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9356 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9358 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9362 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
// Each blend slot (Mask[i] % Size) may only be claimed by one source.
9364 if (BlendMask[Mask[i] % Size] < 0)
9365 BlendMask[Mask[i] % Size] = Mask[i];
9366 else if (BlendMask[Mask[i] % Size] != Mask[i])
9367 return SDValue(); // Can't blend in the needed input!
9369 PermuteMask[i] = Mask[i] % Size;
9372 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9373 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9376 /// \brief Generic routine to decompose a shuffle and blend into independent
9377 /// blends and permutes.
9379 /// This matches the extremely common pattern for handling combined
9380 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9381 /// operations. It will try to pick the best arrangement of shuffles and
9383 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9387 SelectionDAG &DAG) {
9388 // Shuffle the input elements into the desired positions in V1 and V2 and
9389 // blend them together.
9390 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9391 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9392 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9393 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9394 if (Mask[i] >= 0 && Mask[i] < Size) {
9395 V1Mask[i] = Mask[i];
9397 } else if (Mask[i] >= Size) {
9398 V2Mask[i] = Mask[i] - Size;
9399 BlendMask[i] = i + Size;
9402 // Try to lower with the simpler initial blend strategy unless one of the
9403 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9404 // shuffle may be able to fold with a load or other benefit. However, when
9405 // we'll have to do 2x as many shuffles in order to achieve this, blending
9406 // first is a better strategy.
9407 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9408 if (SDValue BlendPerm =
9409 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
// Fallback: pre-shuffle each input into place, then blend lane-wise.
9412 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9413 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9414 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9417 /// \brief Try to lower a vector shuffle as a rotation.
9419 /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
// Returns the rotation amount in elements (a positive value) on success and
// reorders V1/V2 into the Lo/Hi inputs of the rotate; the failure returns
// are in lines elided from this listing.
9420 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9421 ArrayRef<int> Mask) {
9422 int NumElts = Mask.size();
9424 // We need to detect various ways of spelling a rotation:
9425 // [11, 12, 13, 14, 15, 0, 1, 2]
9426 // [-1, 12, 13, 14, -1, -1, 1, -1]
9427 // [-1, -1, -1, -1, -1, -1, 1, 2]
9428 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9429 // [-1, 4, 5, 6, -1, -1, 9, -1]
9430 // [-1, 4, 5, 6, -1, -1, -1, -1]
9433 for (int i = 0; i < NumElts; ++i) {
9435 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9436 "Unexpected mask index.");
9440 // Determine where a rotated vector would have started.
9441 int StartIdx = i - (M % NumElts);
9443 // The identity rotation isn't interesting, stop.
9446 // If we found the tail of a vector the rotation must be the missing
9447 // front. If we found the head of a vector, it must be how much of the
9449 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9452 Rotation = CandidateRotation;
9453 else if (Rotation != CandidateRotation)
9454 // The rotations don't match, so we can't match this mask.
9457 // Compute which value this mask is pointing at.
9458 SDValue MaskV = M < NumElts ? V1 : V2;
9460 // Compute which of the two target values this index should be assigned
9461 // to. This reflects whether the high elements are remaining or the low
9462 // elements are remaining.
9463 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9465 // Either set up this value if we've not encountered it before, or check
9466 // that it remains consistent.
9469 else if (TargetV != MaskV)
9470 // This may be a rotation, but it pulls from the inputs in some
9471 // unsupported interleaving.
9475 // Check that we successfully analyzed the mask, and normalize the results.
9476 assert(Rotation != 0 && "Failed to locate a viable rotation!");
9477 assert((Lo || Hi) && "Failed to find a rotated input vector!");
9489 /// \brief Try to lower a vector shuffle as a byte rotation.
9491 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9492 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9493 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9494 /// try to generically lower a vector shuffle through such an pattern. It
9495 /// does not check for the profitability of lowering either as PALIGNR or
9496 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9497 /// This matches shuffle vectors that look like:
9499 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9501 /// Essentially it concatenates V1 and V2, shifts right by some number of
9502 /// elements, and takes the low elements as the result. Note that while this is
9503 /// specified as a *right shift* because x86 is little-endian, it is a *left
9504 /// rotate* of the vector lanes.
// Returns the rotation in BYTES (scaled from the element rotation), or a
// non-positive value on failure.
9505 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9506 ArrayRef<int> Mask) {
9507 // Don't accept any shuffles with zero elements.
9508 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9511 // PALIGNR works on 128-bit lanes.
9512 SmallVector<int, 16> RepeatedMask;
9513 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
9516 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9520 // PALIGNR rotates bytes, so we need to scale the
9521 // rotation based on how many bytes are in the vector lane.
9522 int NumElts = RepeatedMask.size();
9523 int Scale = 16 / NumElts;
9524 return Rotation * Scale;
// Lower a shuffle as a byte rotation: PALIGNR on SSSE3+, otherwise the
// SSE2 PSLLDQ/PSRLDQ/OR expansion (128-bit only).
9527 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9528 SDValue V1, SDValue V2,
9530 const X86Subtarget &Subtarget,
9531 SelectionDAG &DAG) {
9532 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9534 SDValue Lo = V1, Hi = V2;
9535 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
9536 if (ByteRotation <= 0)
9539 // Cast the inputs to i8 vector of correct length to match PALIGNR or
9541 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9542 Lo = DAG.getBitcast(ByteVT, Lo);
9543 Hi = DAG.getBitcast(ByteVT, Hi);
9545 // SSSE3 targets can use the palignr instruction.
9546 if (Subtarget.hasSSSE3()) {
9547 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9548 "512-bit PALIGNR requires BWI instructions");
9549 return DAG.getBitcast(
9550 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9551 DAG.getConstant(ByteRotation, DL, MVT::i8)));
9554 assert(VT.is128BitVector() &&
9555 "Rotate-based lowering only supports 128-bit lowering!");
9556 assert(Mask.size() <= 16 &&
9557 "Can shuffle at most 16 bytes in a 128-bit vector!");
9558 assert(ByteVT == MVT::v16i8 &&
9559 "SSE2 rotate lowering only needed for v16i8!");
9561 // Default SSE2 implementation
// Rotate = (Lo << (16 - r)) | (Hi >> r), expressed as byte shifts.
9562 int LoByteShift = 16 - ByteRotation;
9563 int HiByteShift = ByteRotation;
9565 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9566 DAG.getConstant(LoByteShift, DL, MVT::i8));
9567 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9568 DAG.getConstant(HiByteShift, DL, MVT::i8));
9569 return DAG.getBitcast(VT,
9570 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9573 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9575 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
9576 /// rotation of the concatenation of two vectors; This routine will
9577 /// try to generically lower a vector shuffle through such an pattern.
9579 /// Essentially it concatenates V1 and V2, shifts right by some number of
9580 /// elements, and takes the low elements as the result. Note that while this is
9581 /// specified as a *right shift* because x86 is little-endian, it is a *left
9582 /// rotate* of the vector lanes.
9583 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9584 SDValue V1, SDValue V2,
9586 const X86Subtarget &Subtarget,
9587 SelectionDAG &DAG) {
9588 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9589 "Only 32-bit and 64-bit elements are supported!");
9591 // 128/256-bit vectors are only supported with VLX.
9592 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9593 && "VLX required for 128/256-bit vectors");
9595 SDValue Lo = V1, Hi = V2;
// VALIGN rotates whole elements, so no byte scaling is needed here.
9596 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9600 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9601 DAG.getConstant(Rotation, DL, MVT::i8));
9604 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9606 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9607 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9608 /// matches elements from one of the input vectors shuffled to the left or
9609 /// right with zeroable elements 'shifted in'. It handles both the strictly
9610 /// bit-wise element shifts and the byte shift across an entire 128-bit double
9613 /// PSHL : (little-endian) left bit shift.
9614 /// [ zz, 0, zz, 2 ]
9615 /// [ -1, 4, zz, -1 ]
9616 /// PSRL : (little-endian) right bit shift.
9618 /// [ -1, -1, 7, zz]
9619 /// PSLLDQ : (little-endian) left byte shift
9620 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9621 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9622 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9623 /// PSRLDQ : (little-endian) right byte shift
9624 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9625 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9626 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
// On success sets Opcode and ShiftVT and returns the shift amount; the
// not-found return path is in lines elided from this listing.
9627 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9628 unsigned ScalarSizeInBits,
9629 ArrayRef<int> Mask, int MaskOffset,
9630 const APInt &Zeroable,
9631 const X86Subtarget &Subtarget) {
9632 int Size = Mask.size();
9633 unsigned SizeInBits = Size * ScalarSizeInBits;
// Verify the positions a shift would vacate are all zeroable.
9635 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9636 for (int i = 0; i < Size; i += Scale)
9637 for (int j = 0; j < Shift; ++j)
9638 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
// Verify the surviving elements form the expected sequential runs, then
// compute the opcode, shift type and shift amount.
9644 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9645 for (int i = 0; i != Size; i += Scale) {
9646 unsigned Pos = Left ? i + Shift : i;
9647 unsigned Low = Left ? i : i + Shift;
9648 unsigned Len = Scale - Shift;
9649 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
// Shifts wider than 64 bits must be whole-byte PSLLDQ/PSRLDQ shifts.
9653 int ShiftEltBits = ScalarSizeInBits * Scale;
9654 bool ByteShift = ShiftEltBits > 64;
9655 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9656 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9657 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9659 // Normalize the scale for byte shifts to still produce an i64 element
9661 Scale = ByteShift ? Scale / 2 : Scale;
9663 // We need to round trip through the appropriate type for the shift.
9664 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9665 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9666 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9667 return (int)ShiftAmt;
9670 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9671 // keep doubling the size of the integer elements up to that. We can
9672 // then shift the elements of the integer vector by whole multiples of
9673 // their width within the elements of the larger integer vector. Test each
9674 // multiple to see if we can find a match with the moved element indices
9675 // and that the shifted in elements are all zeroable.
9676 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9677 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9678 for (int Shift = 1; Shift != Scale; ++Shift)
9679 for (bool Left : {true, false})
9680 if (CheckZeros(Shift, Scale, Left)) {
9681 int ShiftAmt = MatchShift(Shift, Scale, Left);
/// Try to lower a shuffle of VT as a whole-element bit/byte shift of one of
/// the two inputs, using matchVectorShuffleAsShift to find a legal
/// VSHLI/VSRLI/VSHLDQ/VSRLDQ encoding.
// NOTE(review): this excerpt appears to have lines elided (declarations of
// the ShiftVT/Opcode out-params and the V selection); comments describe only
// the visible code.
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  // The mask must describe every element of the shuffled vector type.
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Try to match shuffle against V1 shift. MaskOffset 0 selects V1 lanes.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift. MaskOffset Size
  // selects V2 lanes.
      matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                Mask, Size, Zeroable, Subtarget);

  // The matcher is expected to only produce legal integer vector types.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  // Perform the shift in ShiftVT (the shift amount is an i8 immediate) and
  // bitcast the result back to the requested type.
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
9725 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9726 // Remainder of lower half result is zero and upper half is all undef.
/// Match a shuffle mask against the SSE4a EXTRQ pattern: extract Len
/// elements starting at Idx from the lower half of one source, with the rest
/// of the lower half zero and the upper half undef. On success, BitLen and
/// BitIdx receive the 6-bit immediate fields for EXTRQI.
// NOTE(review): this excerpt appears to have lines elided (Len/Src/Idx
// declarations and several statement bodies); comments describe only the
// visible code.
static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask, uint64_t &BitLen,
                                      uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable: shrink Len until it ends on a
  // non-zeroable lane.
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  for (int i = 0; i != Len; ++i) {
    if (M == SM_SentinelUndef)
    // Select which source operand this mask element references.
    SDValue &V = (M < Size ? V1 : V2);

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)

    // Accept if this is the first match (Idx unset) or it continues the same
    // source at the same base index.
    if (Idx < 0 || (Src == V && Idx == (M - i))) {

  // No source element was matched at all.
  if (!Src || Idx < 0)

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  // EXTRQ immediates encode the bit length and bit index in 6 bits each;
  // mask with 0x3f to stay within the encodable range.
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9780 // INSERTQ: Extract lowest Len elements from lower half of second source and
9781 // insert over first source, starting at Idx.
9782 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
/// Match a shuffle mask against the SSE4a INSERTQ pattern (see the comment
/// above): the low Len elements of the second source are inserted into the
/// first source at position Idx, upper half undef. On success the 6-bit
/// BitLen/BitIdx immediates are filled in.
// NOTE(review): this excerpt appears to have lines elided (the BitIdx
// parameter, Base/Len bookkeeping and several branch bodies); comments
// describe only the visible code.
static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                        ArrayRef<int> Mask, uint64_t &BitLen,
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))

  // Try every candidate insertion point in the lower half.
  for (int Idx = 0; Idx != HalfSize; ++Idx) {

    // Attempt to match first source from mask before insertion point.
    // The prefix may be undef, sequential from V1 (base 0), or sequential
    // from V2 (base Size).
    if (isUndefInRange(Mask, 0, Idx)) {
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {

    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {

      // Match the remaining elements of the lower half.
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,

      // INSERTQ immediates encode bit length and bit index in 6 bits each.
      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9848 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
// NOTE(review): this excerpt appears to have the trailing fall-through
// return elided; comments describe only the visible code.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;
  // First try a plain bit-field extraction (EXTRQI) from V1; the matcher
  // fills in the 6-bit length/index immediates.
  if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  // Otherwise try inserting V2's low bits into V1 (INSERTQI). The matcher
  // may have cleared either operand (taking it by reference), in which case
  // an UNDEF of the right type is substituted.
  if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
                       V2 ? V2 : DAG.getUNDEF(VT),
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));
9868 /// \brief Lower a vector shuffle as a zero or any extension.
9870 /// Given a specific number of elements, element bit width, and extension
9871 /// stride, produce either a zero or any extension based on the available
9872 /// features of the subtarget. The extended elements are consecutive and
9873 /// begin and can start from an offsetted element index in the input; to
9874 /// avoid excess shuffling the offset must either being in the bottom lane
9875 /// or at the start of a higher lane. All extended elements must be from
// NOTE(review): this excerpt appears to have lines elided (several early
// returns, closing braces and the do-loop header); comments describe only
// the visible code.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      // Cross-lane picks are not representable here; leave them undef (-1).
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);

  // Found a valid zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);
    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  if (AnyExt && EltBits == 16 && Scale > 2) {
    // First spread the pair of i16 elements across i32 lanes with PSHUFD,
    // then position them within each i32 with PSHUFLW/PSHUFHW.
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    // Odd offsets live in the high i16 of the pair, so shuffle the low
    // words; even offsets shuffle the high words.
    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    // Extract the element at Offset into the low 64 bits, zeroing the rest.
    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(LoIdx, DL, MVT::i8)));

    // If only the low half of the result is demanded (or the next source
    // element would cross a lane), one EXTRQI suffices.
    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
        !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    // Otherwise extract the next element too and interleave the two halves.
    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      // 0x80 in a PSHUFB mask byte zeroes that destination byte.
      PSHUFBMask[i] = DAG.getConstant(
          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    // Rotate the source down so the unpack sequence starts at element 0.
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;

  // Otherwise emit a sequence of unpacks.
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    // Unpack against undef for an any-extend, or against zero for a zext.
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
10023 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
10025 /// This routine will try to do everything in its power to cleverly lower
10026 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
10027 /// check for the profitability of this lowering, it tries to aggressively
10028 /// match this pattern. It will use all of the micro-architectural details it
10029 /// can to emit an efficient lowering. It handles both blends with all-zero
10030 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
10031 /// masking out later).
10033 /// The reason we have dedicated lowering for zext-style shuffles is that they
10034 /// are both incredibly common and often quite performance sensitive.
// NOTE(review): this excerpt appears to have lines elided (InputV / Matches /
// Offset declarations, the M mask-element read, and several returns);
// comments describe only the visible code.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it if
  auto Lower = [&](int Scale) -> SDValue {
    bool AnyExt = true;
    for (int i = 0; i < NumElements; ++i) {
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        // We no longer are in the anyext case.

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))

      // If we are offsetting, all referenced entries must come from the same
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))

  // General extends failed, but 128-bit vectors may be able to use MOVQ.

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    // The upper half must be entirely zeroable; the lower half must be a
    // sequential run from the start of either source.
    for (int i = NumElements / 2; i != NumElements; ++i)
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))

  if (SDValue V = CanZExtLowHalf()) {
    // VZEXT_MOVL on v2i64 copies the low 64 bits and zeroes the rest.
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);

  // No viable ext lowering found.
10151 /// \brief Try to get a scalar value for a specific element of a vector.
10153 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
// NOTE(review): this excerpt appears to have the fall-through failure
// returns elided; comments describe only the visible code.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  // Look through any bitcasts wrapping the vector.
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())

  // Only BUILD_VECTOR exposes every element; SCALAR_TO_VECTOR only defines
  // element 0, so it is only usable when Idx == 0.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
10178 /// \brief Helper to test for a load that can be folded with x86 shuffles.
10180 /// This is particularly important because the set of instructions varies
10181 /// significantly based on whether the operand is a load or not.
10182 static bool isShuffleFoldableLoad(SDValue V) {
10183 V = peekThroughBitcasts(V);
10184 return ISD::isNON_EXTLoad(V.getNode());
10187 /// \brief Try to lower insertion of a single element into a zero vector.
10189 /// This is a common pattern that we have especially efficient patterns to lower
10190 /// across all subtarget feature sets.
// NOTE(review): this excerpt appears to have lines elided (the V2Index and
// ExtVT declarations and several early returns); comments describe only the
// visible code.
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT EltVT = VT.getVectorElementType();
      // Locate the (single) mask element sourced from V2 (index >= size).
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
  // Every element other than the inserted one must be zeroable for V1 to be
  // treated as an all-zero vector.
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // Zero-extend directly to i32.
      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
    // Aside from the inserted slot, the shuffle must leave V1 untouched.
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
    if (!VT.is128BitVector())

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)

  // VZEXT_MOVL moves the scalar into lane 0 and zeroes all other lanes.
  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
      // Byte-shift the element into place: VSHLDQ shifts whole bytes, hence
      // the /8 on the bit offset.
      V2 = DAG.getBitcast(MVT::v16i8, V2);
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
                              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
10287 /// Try to lower broadcast of a single - truncated - integer element,
10288 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
10290 /// This assumes we have AVX2.
// NOTE(review): this excerpt appears to have the early failure returns
// elided; comments describe only the visible code.
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  // Truncation only makes sense between integer element types.
  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  // Scale maps an index in VT's elements to an index in V0's wider elements.
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  // SCALAR_TO_VECTOR only defines element 0; BUILD_VECTOR exposes them all.
  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));

  // Truncate the (possibly shifted) scalar to the narrow element type and
  // broadcast it.
  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
10340 /// \brief Try to lower broadcast of a single element.
10342 /// For convenience, this code also bundles all of the subtarget feature set
10343 /// filtering. While a little annoying to re-dispatch on type here, there isn't
10344 /// a convenient way to factor it out.
// NOTE(review): this excerpt appears to have lines elided (the peek loop
// header, several case/brace closers and early returns); comments describe
// only the visible code.
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  // Broadcasts require SSE3 MOVDDUP (v2f64 only), AVX for FP vectors, or
  // AVX2 for integer vectors.
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumElts = Mask.size();
  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
                        : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast: every element selects the same
  // source index i.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {

  if (BroadcastIdx < 0)
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      // Peek through bitcasts as long as BroadcastIdx can be adjusted.
      SDValue VSrc = V.getOperand(0);
      unsigned NumEltBits = V.getScalarValueSizeInBits();
      unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
      // Rescale the broadcast index to the source's element granularity.
      if ((NumEltBits % NumSrcBits) == 0)
        BroadcastIdx *= (NumEltBits / NumSrcBits);
      else if ((NumSrcBits % NumEltBits) == 0 &&
               (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
        BroadcastIdx /= (NumSrcBits / NumEltBits);
    case ISD::CONCAT_VECTORS: {
      // Step into the operand that actually holds the broadcast element.
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
    case ISD::INSERT_SUBVECTOR: {
      // Step into either the inserted subvector or the outer vector,
      // depending on where BroadcastIdx falls.
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      int BeginIdx = (int)ConstantIdx->getZExtValue();
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;

  // Ensure the source vector and BroadcastIdx are for a suitable type.
  if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
    unsigned NumEltBits = VT.getScalarSizeInBits();
    unsigned NumSrcBits = V.getScalarValueSizeInBits();
    if ((NumSrcBits % NumEltBits) == 0)
      BroadcastIdx *= (NumSrcBits / NumEltBits);
    else if ((NumEltBits % NumSrcBits) == 0 &&
             (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
      BroadcastIdx /= (NumEltBits / NumSrcBits);

    unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
    MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
    V = DAG.getBitcast(SrcVT, V);

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    // Build a narrow scalar load at the broadcast element's offset, reusing
    // the original load's chain and memory operand info.
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    // Keep memory ordering: make users of the old load's chain depend on the
    // new load as well.
    DAG.makeEquivalentMemoryOrdering(Ld, V);
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    if (!VT.is256BitVector() && !VT.is512BitVector())

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
    assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
           "Unexpected vector size");
    V = extract128BitVector(V, BroadcastIdx, DAG, DL);

  // MOVDDUP operates on a vector, so wrap a bare f64 scalar first.
  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                    DAG.getBitcast(MVT::f64, V));

  // Bitcast back to the same scalar type as BroadcastVT.
  MVT SrcVT = V.getSimpleValueType();
  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
      SrcVT = BroadcastVT.getScalarType();
    V = DAG.getBitcast(SrcVT, V);

  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
    V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (SrcVT.getSizeInBits() > 128) {
    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
                                 128 / SrcVT.getScalarSizeInBits());
    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
    V = DAG.getBitcast(ExtVT, V);

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10553 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10554 // INSERTPS when the V1 elements are already in the correct locations
10555 // because otherwise we can just always use two SHUFPS instructions which
10556 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10557 // perform INSERTPS if a single V1 element is out of place and all V2
10558 // elements are zeroable.
// NOTE(review): this excerpt appears to have lines elided (the CandidateMask
// zero-test, several index assignments and returns); comments describe only
// the visible code.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        // VB input for insertion.

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.

    // Insert the V2 element into the desired position: the INSERTPS imm8 is
    // src index in bits [7:6], dst index in bits [5:4], zero mask in [3:0].
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");

  if (matchAsInsertPS(V1, V2, Mask))

  // Commute and try again with the operands swapped.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
// Lower a v4f32 shuffle via a single INSERTPS node when the matcher above
// can produce an immediate for it.
// NOTE(review): the failure path (return of an empty SDValue after the
// matcher rejects) is among the lines missing from this excerpt.
10648 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10649 SDValue V2, ArrayRef<int> Mask,
10650 const APInt &Zeroable,
10651 SelectionDAG &DAG) {
10652 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10653 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10655 // Attempt to match the insertps pattern.
10656 unsigned InsertPSMask;
10657 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10660 // Insert the V2 element into the desired position.
10661 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10662 DAG.getConstant(InsertPSMask, DL, MVT::i8));
10665 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10666 /// UNPCK instruction.
10668 /// This specifically targets cases where we end up with alternating between
10669 /// the two inputs, and so can permute them into something that feeds a single
10670 /// UNPCK instruction. Note that this routine only targets integer vectors
10671 /// because for floating point vectors we have a generalized SHUFPS lowering
10672 /// strategy that handles everything that doesn't *exactly* match an unpack,
10673 /// making this clever lowering unnecessary.
// NOTE(review): excerpt is decimated (embedded line numbers skip); several
// statements (e.g. the NumLoInputs/NumHiInputs declarations feeding the two
// count_if calls, and early continue/return lines) are missing here.
10674 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10675 SDValue V1, SDValue V2,
10676 ArrayRef<int> Mask,
10677 SelectionDAG &DAG) {
10678 assert(!VT.isFloatingPoint() &&
10679 "This routine only supports integer vectors.");
10680 assert(VT.is128BitVector() &&
10681 "This routine only works on 128-bit vectors.");
10682 assert(!V2.isUndef() &&
10683 "This routine should only be used when blending two inputs.");
10684 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10686 int Size = Mask.size();
// Count how many mask elements read from the low vs. high half of a source.
10689 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10691 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
// Prefer UNPCKL when at least as many inputs come from the low halves.
10693 bool UnpackLo = NumLoInputs >= NumHiInputs;
10695 auto TryUnpack = [&](int ScalarSize, int Scale) {
10696 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10697 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10699 for (int i = 0; i < Size; ++i) {
10703 // Each element of the unpack contains Scale elements from this mask.
10704 int UnpackIdx = i / Scale;
10706 // We only handle the case where V1 feeds the first slots of the unpack.
10707 // We rely on canonicalization to ensure this is the case.
10708 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10711 // Setup the mask for this input. The indexing is tricky as we have to
10712 // handle the unpack stride.
10713 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10714 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10718 // If we will have to shuffle both inputs to use the unpack, check whether
10719 // we can just unpack first and shuffle the result. If so, skip this unpack.
10720 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10721 !isNoopShuffleMask(V2Mask))
10724 // Shuffle the inputs into place.
10725 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10726 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10728 // Cast the inputs to the type we will use to unpack them.
10729 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10730 V1 = DAG.getBitcast(UnpackVT, V1);
10731 V2 = DAG.getBitcast(UnpackVT, V2);
10733 // Unpack the inputs and cast the result back to the desired type.
10734 return DAG.getBitcast(
10735 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10736 UnpackVT, V1, V2));
10739 // We try each unpack from the largest to the smallest to try and find one
10740 // that fits this mask.
10741 int OrigScalarSize = VT.getScalarSizeInBits();
10742 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10743 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10746 // If none of the unpack-rooted lowerings worked (or were profitable) try an
// Fallback: unpack first, then permute the result, when all inputs come from
// one half of each source.
10748 if (NumLoInputs == 0 || NumHiInputs == 0) {
10749 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10750 "We have to have *some* inputs!");
10751 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10753 // FIXME: We could consider the total complexity of the permute of each
10754 // possible unpacking. Or at the least we should consider how many
10755 // half-crossings are created.
10756 // FIXME: We could consider commuting the unpacks.
10758 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10759 for (int i = 0; i < Size; ++i) {
10763 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
// Interleaved post-unpack index: even slots from V1, odd slots from V2.
10766 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10768 return DAG.getVectorShuffle(
10769 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10771 DAG.getUNDEF(VT), PermMask);
10777 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10779 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10780 /// support for floating point shuffles but not integer shuffles. These
10781 /// instructions will incur a domain crossing penalty on some chips though so
10782 /// it is better to avoid lowering through this for integer vectors where
// (comment continues in the full file; this excerpt drops line 10783.)
// NOTE(review): decimated excerpt -- the "return Broadcast/Insertion/Blend/V"
// lines after each successful sub-lowering are among the missing lines.
10784 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10785 const APInt &Zeroable,
10786 SDValue V1, SDValue V2,
10787 const X86Subtarget &Subtarget,
10788 SelectionDAG &DAG) {
10789 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10790 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10791 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10793 if (V2.isUndef()) {
10794 // Check for being able to broadcast a single element.
10795 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10796 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10799 // Straight shuffle of a single input vector. Simulate this by using the
10800 // single input as both of the "inputs" to this instruction..
// 2-bit SHUFPD immediate: bit i selects element 1 of the i-th source.
10801 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
10803 if (Subtarget.hasAVX()) {
10804 // If we have AVX, we can use VPERMILPS which will allow folding a load
10805 // into the shuffle.
10806 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10807 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10810 return DAG.getNode(
10811 X86ISD::SHUFP, DL, MVT::v2f64,
10812 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10813 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10814 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10816 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10817 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10819 // If we have a single input, insert that into V1 if we can do so cheaply.
10820 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10821 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10822 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10824 // Try inverting the insertion since for v2 masks it is easy to do and we
10825 // can't reliably sort the mask one way or the other.
// XOR with 2 swaps which source vector each lane refers to (0<->2, 1<->3).
10826 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10827 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10828 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10829 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10833 // Try to use one of the special instruction patterns to handle two common
10834 // blend patterns if a zero-blend above didn't work.
10835 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10836 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10837 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10838 // We can either use a special instruction to load over the low double or
10839 // to move just the low double.
10840 return DAG.getNode(
10841 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10842 DL, MVT::v2f64, V2,
10843 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10845 if (Subtarget.hasSSE41())
10846 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10847 Zeroable, Subtarget, DAG))
10850 // Use dedicated unpack instructions for masks that match their pattern.
10852 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
// Final fallback: a two-input SHUFPD (Mask[1] is in [2,3] here, hence -2).
10855 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10856 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10857 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10860 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10862 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10863 /// the integer unit to minimize domain crossing penalties. However, for blends
10864 /// it falls back to the floating point shuffle operation with appropriate bit
// (comment continues in the full file; this excerpt drops line 10865.)
// NOTE(review): decimated excerpt -- the "return ..." lines that follow each
// successful sub-lowering are missing here; verify against the full file.
10866 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10867 const APInt &Zeroable,
10868 SDValue V1, SDValue V2,
10869 const X86Subtarget &Subtarget,
10870 SelectionDAG &DAG) {
10871 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10872 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10873 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10875 if (V2.isUndef()) {
10876 // Check for being able to broadcast a single element.
10877 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10878 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10881 // Straight shuffle of a single input vector. For everything from SSE2
10882 // onward this has a single fast instruction with no scary immediates.
10883 // We have to map the mask as it is actually a v4i32 shuffle instruction.
10884 V1 = DAG.getBitcast(MVT::v4i32, V1);
// Each v2i64 lane expands to a pair of consecutive v4i32 lanes; undef (-1)
// is clamped to 0 via std::max before widening.
10885 int WidenedMask[4] = {
10886 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10887 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10888 return DAG.getBitcast(
10890 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10891 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10893 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10894 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10895 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10896 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10898 // Try to use shift instructions.
10899 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10900 Zeroable, Subtarget, DAG))
10903 // When loading a scalar and then shuffling it into a vector we can often do
10904 // the insertion cheaply.
10905 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10906 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10908 // Try inverting the insertion since for v2 masks it is easy to do and we
10909 // can't reliably sort the mask one way or the other.
10910 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10911 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10912 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10915 // We have different paths for blend lowering, but they all must use the
10916 // *exact* same predicate.
10917 bool IsBlendSupported = Subtarget.hasSSE41();
10918 if (IsBlendSupported)
10919 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10920 Zeroable, Subtarget, DAG))
10923 // Use dedicated unpack instructions for masks that match their pattern.
10925 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10928 // Try to use byte rotation instructions.
10929 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
10930 if (Subtarget.hasSSSE3()) {
10931 if (Subtarget.hasVLX())
10932 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
10933 Mask, Subtarget, DAG))
10936 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10937 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10941 // If we have direct support for blends, we should lower by decomposing into
10942 // a permute. That will be faster than the domain cross.
10943 if (IsBlendSupported)
10944 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10947 // We implement this with SHUFPD which is pretty lame because it will likely
10948 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10949 // However, all the alternatives are still more cycles and newer chips don't
10950 // have this problem. It would be really nice if x86 had better shuffles here.
10951 V1 = DAG.getBitcast(MVT::v2f64, V1);
10952 V2 = DAG.getBitcast(MVT::v2f64, V2);
10953 return DAG.getBitcast(MVT::v2i64,
10954 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10957 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10959 /// This is used to disable more specialized lowerings when the shufps lowering
10960 /// will happen to be efficient.
// NOTE(review): the "return false;" / "return true;" lines and the closing
// brace are missing from this excerpt (line numbers jump 10971 -> 10973).
10961 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10962 // This routine only handles 128-bit shufps.
10963 assert(Mask.size() == 4 && "Unsupported mask size!");
10964 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10965 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10966 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10967 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10969 // To lower with a single SHUFPS we need to have the low half and high half
10970 // each requiring a single input.
// A SHUFPS result takes its low two lanes from one source and its high two
// lanes from the other, so mixed sources within a half disqualify the mask.
10971 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10973 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10979 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10981 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10982 /// It makes no assumptions about whether this is the *best* lowering, it simply
// (comment continues in the full file; this excerpt drops line 10983.)
// NOTE(review): decimated excerpt -- some branch bodies (e.g. the assignments
// under the NumV2Elements == 2 easy/reversed cases) are missing lines.
10984 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10985 ArrayRef<int> Mask, SDValue V1,
10986 SDValue V2, SelectionDAG &DAG) {
// LowV/HighV feed the final SHUFPS; NewMask is rewritten as we canonicalize.
10987 SDValue LowV = V1, HighV = V2;
10988 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10990 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10992 if (NumV2Elements == 1) {
10993 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10995 // Compute the index adjacent to V2Index and in the same half by toggling
// the low bit (pairs lanes 0/1 and 2/3).
10997 int V2AdjIndex = V2Index ^ 1;
10999 if (Mask[V2AdjIndex] < 0) {
11000 // Handles all the cases where we have a single V2 element and an undef.
11001 // This will only ever happen in the high lanes because we commute the
11002 // vector otherwise.
11004 std::swap(LowV, HighV);
11005 NewMask[V2Index] -= 4;
11007 // Handle the case where the V2 element ends up adjacent to a V1 element.
11008 // To make this work, blend them together as the first step.
11009 int V1Index = V2AdjIndex;
11010 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
11011 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11012 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11014 // Now proceed to reconstruct the final blend as we have the necessary
11015 // high or low half formed.
11022 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
11023 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
11025 } else if (NumV2Elements == 2) {
11026 if (Mask[0] < 4 && Mask[1] < 4) {
11027 // Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes (body elided in this excerpt).
11031 } else if (Mask[2] < 4 && Mask[3] < 4) {
11032 // We also handle the reversed case because this utility may get called
11033 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
11034 // arrange things in the right direction.
11040 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
11041 // trying to place elements directly, just blend them and set up the final
11042 // shuffle to place them.
11044 // The first two blend mask elements are for V1, the second two are for
// V2 (indices rebased by -4 into V2's own lanes).
11046 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
11047 Mask[2] < 4 ? Mask[2] : Mask[3],
11048 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
11049 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
11050 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11051 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11053 // Now we do a normal shuffle of V1 by giving V1 as both operands to
// the final SHUFPS, selecting from the blended vector.
11056 NewMask[0] = Mask[0] < 4 ? 0 : 2;
11057 NewMask[1] = Mask[0] < 4 ? 2 : 0;
11058 NewMask[2] = Mask[2] < 4 ? 1 : 3;
11059 NewMask[3] = Mask[2] < 4 ? 3 : 1;
11062 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
11063 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
11066 /// \brief Lower 4-lane 32-bit floating point shuffles.
11068 /// Uses instructions exclusively from the floating point unit to minimize
11069 /// domain crossing penalties, as these are sufficient to implement all v4f32
// (comment continues in the full file; this excerpt drops line 11070.)
// NOTE(review): decimated excerpt -- "return Broadcast/V/Blend/..." lines
// after each successful sub-lowering are among the missing lines.
11071 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11072 const APInt &Zeroable,
11073 SDValue V1, SDValue V2,
11074 const X86Subtarget &Subtarget,
11075 SelectionDAG &DAG) {
11076 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11077 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11078 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11080 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
// Single-input (V1-only) lowering paths.
11082 if (NumV2Elements == 0) {
11083 // Check for being able to broadcast a single element.
11084 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11085 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
11088 // Use even/odd duplicate instructions for masks that match their pattern.
11089 if (Subtarget.hasSSE3()) {
11090 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11091 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
11092 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
11093 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
11096 if (Subtarget.hasAVX()) {
11097 // If we have AVX, we can use VPERMILPS which will allow folding a load
11098 // into the shuffle.
11099 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11100 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11103 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11104 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11105 if (!Subtarget.hasSSE2()) {
11106 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11107 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11108 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11109 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
11112 // Otherwise, use a straight shuffle of a single input vector. We pass the
11113 // input vector to both operands to simulate this with a SHUFPS.
11114 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11115 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11118 // There are special ways we can lower some single-element blends. However, we
11119 // have custom ways we can lower more complex single-element blends below that
11120 // we defer to if both this and BLENDPS fail to match, so restrict this to
11121 // when the V2 input is targeting element 0 of the mask -- that is the fast
// path (comment truncated in this excerpt).
11123 if (NumV2Elements == 1 && Mask[0] >= 4)
11124 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11125 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11128 if (Subtarget.hasSSE41()) {
11129 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11130 Zeroable, Subtarget, DAG))
11133 // Use INSERTPS if we can complete the shuffle efficiently.
11135 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
// Only try blend+permute when a single SHUFPS would not already suffice.
11138 if (!isSingleSHUFPSMask(Mask))
11139 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11140 DL, MVT::v4f32, V1, V2, Mask, DAG))
11144 // Use low/high mov instructions. These are only valid in SSE1 because
11145 // otherwise they are widened to v2f64 and never get here.
11146 if (!Subtarget.hasSSE2()) {
11147 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11148 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11149 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11150 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
11153 // Use dedicated unpack instructions for masks that match their pattern.
11155 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
11158 // Otherwise fall back to a SHUFPS lowering strategy.
11159 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
11162 /// \brief Lower 4-lane i32 vector shuffles.
11164 /// We try to handle these with integer-domain shuffles where we can, but for
11165 /// blends we use the floating point domain blend instructions.
// NOTE(review): decimated excerpt -- success-path "return ..." lines and some
// trailing call arguments (e.g. after line 11254) are among the missing lines.
11166 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11167 const APInt &Zeroable,
11168 SDValue V1, SDValue V2,
11169 const X86Subtarget &Subtarget,
11170 SelectionDAG &DAG) {
11171 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11172 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11173 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11175 // Whenever we can lower this as a zext, that instruction is strictly faster
11176 // than any alternative. It also allows us to fold memory operands into the
11177 // shuffle in many cases.
11178 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11179 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11182 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
// Single-input (V1-only) lowering paths.
11184 if (NumV2Elements == 0) {
11185 // Check for being able to broadcast a single element.
11186 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11187 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11190 // Straight shuffle of a single input vector. For everything from SSE2
11191 // onward this has a single fast instruction with no scary immediates.
11192 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11193 // but we aren't actually going to use the UNPCK instruction because doing
11194 // so prevents folding a load into this instruction or making a copy.
11195 const int UnpackLoMask[] = {0, 0, 1, 1};
11196 const int UnpackHiMask[] = {2, 2, 3, 3};
11197 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11198 Mask = UnpackLoMask;
11199 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11200 Mask = UnpackHiMask;
11202 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11203 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11206 // Try to use shift instructions.
11207 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11208 Zeroable, Subtarget, DAG))
11211 // There are special ways we can lower some single-element blends.
11212 if (NumV2Elements == 1)
11213 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11214 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11217 // We have different paths for blend lowering, but they all must use the
11218 // *exact* same predicate.
11219 bool IsBlendSupported = Subtarget.hasSSE41();
11220 if (IsBlendSupported)
11221 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11222 Zeroable, Subtarget, DAG))
11225 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
11229 // Use dedicated unpack instructions for masks that match their pattern.
11231 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
11234 // Try to use byte rotation instructions.
11235 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
11236 if (Subtarget.hasSSSE3()) {
11237 if (Subtarget.hasVLX())
11238 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11239 Mask, Subtarget, DAG))
11242 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11243 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11247 // Assume that a single SHUFPS is faster than an alternative sequence of
11248 // multiple instructions (even if the CPU has a domain penalty).
11249 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11250 if (!isSingleSHUFPSMask(Mask)) {
11251 // If we have direct support for blends, we should lower by decomposing into
11252 // a permute. That will be faster than the domain cross.
11253 if (IsBlendSupported)
11254 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
11257 // Try to lower by permuting the inputs into an unpack instruction.
11258 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11259 DL, MVT::v4i32, V1, V2, Mask, DAG))
11263 // We implement this with SHUFPS because it can blend from two vectors.
11264 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11265 // up the inputs, bypassing domain shift penalties that we would incur if we
11266 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// a problem (comment truncated in this excerpt).
11268 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11269 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11270 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11271 return DAG.getBitcast(MVT::v4i32, ShufPS);
11274 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11275 /// shuffle lowering, and the most complex part.
11277 /// The lowering strategy is to try to form pairs of input lanes which are
11278 /// targeted at the same half of the final vector, and then use a dword shuffle
11279 /// to place them onto the right half, and finally unpack the paired lanes into
11280 /// their final position.
11282 /// The exact breakdown of how to form these dword pairs and align them on the
11283 /// correct sides is really tricky. See the comments within the function for
11284 /// more of the details.
11286 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11287 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11288 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11289 /// vector, form the analogous 128-bit 8-element Mask.
11290 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11291 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11292 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11293 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11294 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11296 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11297 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11298 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11300 // Attempt to directly match PSHUFLW or PSHUFHW.
11301 if (isUndefOrInRange(LoMask, 0, 4) &&
11302 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
11303 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11304 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11306 if (isUndefOrInRange(HiMask, 4, 8) &&
11307 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
11308 for (int i = 0; i != 4; ++i)
11309 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
11310 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11311 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11314 SmallVector<int, 4> LoInputs;
11315 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11316 std::sort(LoInputs.begin(), LoInputs.end());
11317 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11318 SmallVector<int, 4> HiInputs;
11319 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11320 std::sort(HiInputs.begin(), HiInputs.end());
11321 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
11323 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11324 int NumHToL = LoInputs.size() - NumLToL;
11326 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11327 int NumHToH = HiInputs.size() - NumLToH;
11328 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11329 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11330 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11331 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
11333 // If we are shuffling values from one half - check how many different DWORD
11334 // pairs we need to create. If only 1 or 2 then we can perform this as a
11335 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
11336 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
11337 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
11338 V = DAG.getNode(ShufWOp, DL, VT, V,
11339 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11340 V = DAG.getBitcast(PSHUFDVT, V);
11341 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11342 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11343 return DAG.getBitcast(VT, V);
11346 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
11347 int PSHUFDMask[4] = { -1, -1, -1, -1 };
11348 SmallVector<std::pair<int, int>, 4> DWordPairs;
11349 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
11351 // Collect the different DWORD pairs.
11352 for (int DWord = 0; DWord != 4; ++DWord) {
11353 int M0 = Mask[2 * DWord + 0];
11354 int M1 = Mask[2 * DWord + 1];
11355 M0 = (M0 >= 0 ? M0 % 4 : M0);
11356 M1 = (M1 >= 0 ? M1 % 4 : M1);
11357 if (M0 < 0 && M1 < 0)
11360 bool Match = false;
11361 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
11362 auto &DWordPair = DWordPairs[j];
11363 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
11364 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
11365 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
11366 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
11367 PSHUFDMask[DWord] = DOffset + j;
11373 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
11374 DWordPairs.push_back(std::make_pair(M0, M1));
11378 if (DWordPairs.size() <= 2) {
11379 DWordPairs.resize(2, std::make_pair(-1, -1));
11380 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
11381 DWordPairs[1].first, DWordPairs[1].second};
11382 if ((NumHToL + NumHToH) == 0)
11383 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
11384 if ((NumLToL + NumLToH) == 0)
11385 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
11389 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11390 // such inputs we can swap two of the dwords across the half mark and end up
11391 // with <=2 inputs to each half in each half. Once there, we can fall through
11392 // to the generic code below. For example:
11394 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11395 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11397 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11398 // and an existing 2-into-2 on the other half. In this case we may have to
11399 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11400 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11401 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11402 // because any other situation (including a 3-into-1 or 1-into-3 in the other
11403 // half than the one we target for fixing) will be fixed when we re-enter this
11404 // path. We will also combine away any sequence of PSHUFD instructions that
11405 // result into a single instruction. Here is an example of the tricky case:
11407 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11408 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11410 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11412 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11413 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11415 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11416 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11418 // The result is fine to be handled by the generic logic.
11419 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11420 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11421 int AOffset, int BOffset) {
11422 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11423 "Must call this with A having 3 or 1 inputs from the A half.");
11424 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11425 "Must call this with B having 1 or 3 inputs from the B half.");
11426 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11427 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11429 bool ThreeAInputs = AToAInputs.size() == 3;
11431 // Compute the index of dword with only one word among the three inputs in
11432 // a half by taking the sum of the half with three inputs and subtracting
11433 // the sum of the actual three inputs. The difference is the remaining
11435 int ADWord, BDWord;
11436 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11437 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11438 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11439 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11440 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11441 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11442 int TripleNonInputIdx =
11443 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11444 TripleDWord = TripleNonInputIdx / 2;
11446 // We use xor with one to compute the adjacent DWord to whichever one the
11448 OneInputDWord = (OneInput / 2) ^ 1;
11450 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11451 // and BToA inputs. If there is also such a problem with the BToB and AToB
11452 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11453 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11454 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11455 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11456 // Compute how many inputs will be flipped by swapping these DWords. We
11458 // to balance this to ensure we don't form a 3-1 shuffle in the other
11460 int NumFlippedAToBInputs =
11461 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11462 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11463 int NumFlippedBToBInputs =
11464 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11465 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11466 if ((NumFlippedAToBInputs == 1 &&
11467 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11468 (NumFlippedBToBInputs == 1 &&
11469 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11470 // We choose whether to fix the A half or B half based on whether that
11471 // half has zero flipped inputs. At zero, we may not be able to fix it
11472 // with that half. We also bias towards fixing the B half because that
11473 // will more commonly be the high half, and we have to bias one way.
11474 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11475 ArrayRef<int> Inputs) {
11476 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11477 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11478 // Determine whether the free index is in the flipped dword or the
11479 // unflipped dword based on where the pinned index is. We use this bit
11480 // in an xor to conditionally select the adjacent dword.
11481 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11482 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11483 if (IsFixIdxInput == IsFixFreeIdxInput)
11485 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11486 assert(IsFixIdxInput != IsFixFreeIdxInput &&
11487 "We need to be changing the number of flipped inputs!");
11488 int PSHUFHalfMask[] = {0, 1, 2, 3};
11489 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
11491 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11492 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11493 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11495 for (int &M : Mask)
11496 if (M >= 0 && M == FixIdx)
11498 else if (M >= 0 && M == FixFreeIdx)
11501 if (NumFlippedBToBInputs != 0) {
11503 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11504 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
11506 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11507 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11508 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
11513 int PSHUFDMask[] = {0, 1, 2, 3};
11514 PSHUFDMask[ADWord] = BDWord;
11515 PSHUFDMask[BDWord] = ADWord;
11516 V = DAG.getBitcast(
11518 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11519 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11521 // Adjust the mask to match the new locations of A and B.
11522 for (int &M : Mask)
11523 if (M >= 0 && M/2 == ADWord)
11524 M = 2 * BDWord + M % 2;
11525 else if (M >= 0 && M/2 == BDWord)
11526 M = 2 * ADWord + M % 2;
11528 // Recurse back into this routine to re-compute state now that this isn't
11529 // a 3 and 1 problem.
11530 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
11533 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11534 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11535 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11536 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11538 // At this point there are at most two inputs to the low and high halves from
11539 // each half. That means the inputs can always be grouped into dwords and
11540 // those dwords can then be moved to the correct half with a dword shuffle.
11541 // We use at most one low and one high word shuffle to collect these paired
11542 // inputs into dwords, and finally a dword shuffle to place them.
11543 int PSHUFLMask[4] = {-1, -1, -1, -1};
11544 int PSHUFHMask[4] = {-1, -1, -1, -1};
11545 int PSHUFDMask[4] = {-1, -1, -1, -1};
11547 // First fix the masks for all the inputs that are staying in their
11548 // original halves. This will then dictate the targets of the cross-half
11550 auto fixInPlaceInputs =
11551 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11552 MutableArrayRef<int> SourceHalfMask,
11553 MutableArrayRef<int> HalfMask, int HalfOffset) {
11554 if (InPlaceInputs.empty())
11556 if (InPlaceInputs.size() == 1) {
11557 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11558 InPlaceInputs[0] - HalfOffset;
11559 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
11562 if (IncomingInputs.empty()) {
11563 // Just fix all of the in place inputs.
11564 for (int Input : InPlaceInputs) {
11565 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11566 PSHUFDMask[Input / 2] = Input / 2;
11571 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11572 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11573 InPlaceInputs[0] - HalfOffset;
11574 // Put the second input next to the first so that they are packed into
11575 // a dword. We find the adjacent index by toggling the low bit.
11576 int AdjIndex = InPlaceInputs[0] ^ 1;
11577 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11578 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11579 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11581 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11582 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11584 // Now gather the cross-half inputs and place them into a free dword of
11585 // their target half.
11586 // FIXME: This operation could almost certainly be simplified dramatically to
11587 // look more like the 3-1 fixing operation.
11588 auto moveInputsToRightHalf = [&PSHUFDMask](
11589 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11590 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11591 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11593 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11594 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11596 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11598 int LowWord = Word & ~1;
11599 int HighWord = Word | 1;
11600 return isWordClobbered(SourceHalfMask, LowWord) ||
11601 isWordClobbered(SourceHalfMask, HighWord);
11604 if (IncomingInputs.empty())
11607 if (ExistingInputs.empty()) {
11608 // Map any dwords with inputs from them into the right half.
11609 for (int Input : IncomingInputs) {
11610 // If the source half mask maps over the inputs, turn those into
11611 // swaps and use the swapped lane.
11612 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11613 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11614 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11615 Input - SourceOffset;
11616 // We have to swap the uses in our half mask in one sweep.
11617 for (int &M : HalfMask)
11618 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11620 else if (M == Input)
11621 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11623 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11624 Input - SourceOffset &&
11625 "Previous placement doesn't match!");
11627 // Note that this correctly re-maps both when we do a swap and when
11628 // we observe the other side of the swap above. We rely on that to
11629 // avoid swapping the members of the input list directly.
11630 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11633 // Map the input's dword into the correct half.
11634 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11635 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11637 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11639 "Previous placement doesn't match!");
11642 // And just directly shift any other-half mask elements to be same-half
11643 // as we will have mirrored the dword containing the element into the
11644 // same position within that half.
11645 for (int &M : HalfMask)
11646 if (M >= SourceOffset && M < SourceOffset + 4) {
11647 M = M - SourceOffset + DestOffset;
11648 assert(M >= 0 && "This should never wrap below zero!");
11653 // Ensure we have the input in a viable dword of its current half. This
11654 // is particularly tricky because the original position may be clobbered
11655 // by inputs being moved and *staying* in that half.
11656 if (IncomingInputs.size() == 1) {
11657 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11658 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11660 SourceHalfMask[InputFixed - SourceOffset] =
11661 IncomingInputs[0] - SourceOffset;
11662 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11664 IncomingInputs[0] = InputFixed;
11666 } else if (IncomingInputs.size() == 2) {
11667 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11668 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11669 // We have two non-adjacent or clobbered inputs we need to extract from
11670 // the source half. To do this, we need to map them into some adjacent
11671 // dword slot in the source mask.
11672 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11673 IncomingInputs[1] - SourceOffset};
11675 // If there is a free slot in the source half mask adjacent to one of
11676 // the inputs, place the other input in it. We use (Index XOR 1) to
11677 // compute an adjacent index.
11678 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11679 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11680 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11681 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11682 InputsFixed[1] = InputsFixed[0] ^ 1;
11683 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11684 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11685 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11686 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11687 InputsFixed[0] = InputsFixed[1] ^ 1;
11688 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11689 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11690 // The two inputs are in the same DWord but it is clobbered and the
11691 // adjacent DWord isn't used at all. Move both inputs to the free
11693 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11694 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11695 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11696 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11698 // The only way we hit this point is if there is no clobbering
11699 // (because there are no off-half inputs to this half) and there is no
11700 // free slot adjacent to one of the inputs. In this case, we have to
11701 // swap an input with a non-input.
11702 for (int i = 0; i < 4; ++i)
11703 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11704 "We can't handle any clobbers here!");
11705 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11706 "Cannot have adjacent inputs here!");
11708 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11709 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11711 // We also have to update the final source mask in this case because
11712 // it may need to undo the above swap.
11713 for (int &M : FinalSourceHalfMask)
11714 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11715 M = InputsFixed[1] + SourceOffset;
11716 else if (M == InputsFixed[1] + SourceOffset)
11717 M = (InputsFixed[0] ^ 1) + SourceOffset;
11719 InputsFixed[1] = InputsFixed[0] ^ 1;
11722 // Point everything at the fixed inputs.
11723 for (int &M : HalfMask)
11724 if (M == IncomingInputs[0])
11725 M = InputsFixed[0] + SourceOffset;
11726 else if (M == IncomingInputs[1])
11727 M = InputsFixed[1] + SourceOffset;
11729 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11730 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11733 llvm_unreachable("Unhandled input size!");
11736 // Now hoist the DWord down to the right half.
11737 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11738 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11739 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11740 for (int &M : HalfMask)
11741 for (int Input : IncomingInputs)
11743 M = FreeDWord * 2 + Input % 2;
11745 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11746 /*SourceOffset*/ 4, /*DestOffset*/ 0);
11747 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11748 /*SourceOffset*/ 0, /*DestOffset*/ 4);
11750 // Now enact all the shuffles we've computed to move the inputs into their
11752 if (!isNoopShuffleMask(PSHUFLMask))
11753 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11754 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11755 if (!isNoopShuffleMask(PSHUFHMask))
11756 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11757 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11758 if (!isNoopShuffleMask(PSHUFDMask))
11759 V = DAG.getBitcast(
11761 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11762 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11764 // At this point, each half should contain all its inputs, and we can then
11765 // just shuffle them into their final position.
11766 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11767 "Failed to lift all the high half inputs to the low mask!");
11768 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11769 "Failed to lift all the low half inputs to the high mask!");
11771 // Do a half shuffle for the low mask.
11772 if (!isNoopShuffleMask(LoMask))
11773 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11774 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11776 // Do a half shuffle with the high mask after shifting its values down.
11777 for (int &M : HiMask)
11780 if (!isNoopShuffleMask(HiMask))
11781 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11782 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11787 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11788 /// blend if only one input is used.
///
/// Builds one per-byte PSHUFB control vector for each of V1 and V2 from the
/// (element-level) shuffle mask, shuffles whichever inputs are actually
/// referenced, and ORs the two results together when both contribute lanes.
/// \p V1InUse / \p V2InUse are out-parameters telling the caller which inputs
/// participated, so it can decide whether a blend was really needed.
/// NOTE(review): this excerpt elides some lines (e.g. the guards around the
/// two PSHUFB emissions) — confirm details against the full source.
11789 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11790     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11791     const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11793   SDValue V1Mask[16];
11794   SDValue V2Mask[16];
  // Scale maps mask elements onto bytes: e.g. for a v8i16 mask, Size == 8 and
  // Scale == 2, so each mask entry expands into two adjacent byte lanes.
11798   int Size = Mask.size();
11799   int Scale = 16 / Size;
11800   for (int i = 0; i < 16; ++i) {
11801     if (Mask[i / Scale] < 0) {
11802       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
      // In a PSHUFB control byte, bit 7 set (0x80) zeroes the output lane.
11804       const int ZeroMask = 0x80;
11805       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11807       int V2Idx = Mask[i / Scale] < Size
11809                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
      // Zeroable lanes zero both controls so neither input is charged for them.
11810       if (Zeroable[i / Scale])
11811         V1Idx = V2Idx = ZeroMask;
11812       V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11813       V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      // An input counts as "in use" once any lane selects a real byte from it.
11814       V1InUse |= (ZeroMask != V1Idx);
11815       V2InUse |= (ZeroMask != V2Idx);
  // Shuffle each participating input as v16i8 with its control vector.
11820     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11821                      DAG.getBitcast(MVT::v16i8, V1),
11822                      DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11824     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11825                      DAG.getBitcast(MVT::v16i8, V2),
11826                      DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11828   // If we need shuffled inputs from both, blend the two.
  // OR is a valid blend here because every lane not taken from an input was
  // zeroed in that input via the 0x80 control byte.
11830   if (V1InUse && V2InUse)
11831     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11833     V = V1InUse ? V1 : V2;
11835   // Cast the result back to the correct type.
11836   return DAG.getBitcast(VT, V);
11839 /// \brief Generic lowering of 8-lane i16 shuffles.
11841 /// This handles both single-input shuffles and combined shuffle/blends with
11842 /// two inputs. The single input shuffles are immediately delegated to
11843 /// a dedicated lowering routine.
11845 /// The blends are lowered in one of three fundamental ways. If there are few
11846 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11847 /// of the input is significantly cheaper when lowered as an interleaving of
11848 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11849 /// halves of the inputs separately (making them have relatively few inputs)
11850 /// and then concatenate them.
///
/// Strategies are tried strictly cheapest-first; each helper returns a null
/// SDValue when its pattern does not apply, and the first non-null result
/// wins. NOTE(review): this excerpt elides some lines (several `return`
/// statements and guards) — confirm control flow against the full source.
11851 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11852                                        const APInt &Zeroable,
11853                                        SDValue V1, SDValue V2,
11854                                        const X86Subtarget &Subtarget,
11855                                        SelectionDAG &DAG) {
11856   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11857   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11858   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11860   // Whenever we can lower this as a zext, that instruction is strictly faster
11861   // than any alternative.
11862   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11863           DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
  // Mask entries >= 8 select from V2; count them to classify the shuffle.
11866   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
  // Single-input path: everything comes from V1.
11868   if (NumV2Inputs == 0) {
11869     // Check for being able to broadcast a single element.
11870     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11871             DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11874     // Try to use shift instructions.
11875     if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11876                                                   Zeroable, Subtarget, DAG))
11879     // Use dedicated unpack instructions for masks that match their pattern.
11881             lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11884     // Use dedicated pack instructions for masks that match their pattern.
11885     if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
11889     // Try to use byte rotation instructions.
11890     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11891                                                         Mask, Subtarget, DAG))
11894     // Make a copy of the mask so it can be modified.
    // The general single-input routine mutates the mask in place, so it needs
    // a mutable copy rather than the incoming ArrayRef.
11895     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11896     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11897                                                      MutableMask, Subtarget,
11901   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11902          "All single-input shuffles should be canonicalized to be V1-input "
11905   // Try to use shift instructions.
11906   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11907                                                 Zeroable, Subtarget, DAG))
11910   // See if we can use SSE4A Extraction / Insertion.
11911   if (Subtarget.hasSSE4A())
11912     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11916   // There are special ways we can lower some single-element blends.
11917   if (NumV2Inputs == 1)
11918     if (SDValue V = lowerVectorShuffleAsElementInsertion(
11919             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11922   // We have different paths for blend lowering, but they all must use the
11923   // *exact* same predicate.
11924   bool IsBlendSupported = Subtarget.hasSSE41();
11925   if (IsBlendSupported)
11926     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11927                                                   Zeroable, Subtarget, DAG))
11930   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11934   // Use dedicated unpack instructions for masks that match their pattern.
11936           lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11939   // Use dedicated pack instructions for masks that match their pattern.
11940   if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
11944   // Try to use byte rotation instructions.
11945   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11946           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11949   if (SDValue BitBlend =
11950           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11953   // Try to lower by permuting the inputs into an unpack instruction.
11954   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11958   // If we can't directly blend but can use PSHUFB, that will be better as it
11959   // can both shuffle and set up the inefficient blend.
11960   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11961     bool V1InUse, V2InUse;
11962     return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11963                                               Zeroable, DAG, V1InUse, V2InUse);
11966   // We can always bit-blend if we have to so the fallback strategy is to
11967   // decompose into single-input permutes and blends.
11968   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11972 /// \brief Check whether a compaction lowering can be done by dropping even
11973 /// elements and compute how many times even elements must be dropped.
11975 /// This handles shuffles which take every Nth element where N is a power of
11976 /// two. Example shuffle masks:
11978 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11979 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11980 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11981 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11982 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11983 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11985 /// Any of these lanes can of course be undef.
11987 /// This routine only supports N <= 3.
11988 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11991 /// \returns N above, or the number of times even elements must be dropped if
11992 /// there is such a number. Otherwise returns zero.
///
/// \param Mask shuffle mask; its size must be a power of two.
/// \param IsSingleInput true when both operands are the same vector, which
///        halves the index modulus used to wrap mask entries.
11993 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11994                                           bool IsSingleInput) {
11995   // The modulus for the shuffle vector entries is based on whether this is
11996   // a single input or not.
11997   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11998   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11999          "We should only be called with masks with a power-of-2 size!");
12001   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12003   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12004   // and 2^3 simultaneously. This is because we may have ambiguity with
12005   // partially undef inputs.
12006   bool ViableForN[3] = {true, true, true};
12008   for (int i = 0, e = Mask.size(); i < e; ++i) {
12009     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
    // For each still-viable stride 2^N, a defined lane must equal
    // (i << N) masked by the modulus; otherwise that stride is ruled out.
12014     bool IsAnyViable = false;
12015     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12016       if (ViableForN[j]) {
12017         uint64_t N = j + 1;
12019         // The shuffle mask must be equal to (i * 2^N) % M.
12020         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12021           IsAnyViable = true;
12023           ViableForN[j] = false;
12025     // Early exit if we exhaust the possible powers of two.
  // Return the smallest N (1-based) that remained viable for every lane.
12030   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12034   // Return 0 as there is no viable power of two.
// Lower a shuffle with a variable-mask permute: builds a constant index
// vector from \p Mask (same element width and count as VT) and emits either a
// single-source VPERMV or a two-source VPERMV3 node.
// NOTE(review): this excerpt elides the guard before the VPERMV return
// (orig. line 12045) — presumably a single-input/V2-undef check; confirm
// against the full source.
12038 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12039                                            ArrayRef<int> Mask, SDValue V1,
12040                                            SDValue V2, SelectionDAG &DAG) {
  // Mask vector mirrors VT's geometry: one integer index per element.
12041   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12042   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12044   SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12046     return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12048   return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12051 /// \brief Generic lowering of v16i8 shuffles.
12053 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
12054 /// detect any complexity reducing interleaving. If that doesn't help, it uses
12055 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
12056 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
12058 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12059 const APInt &Zeroable,
12060 SDValue V1, SDValue V2,
12061 const X86Subtarget &Subtarget,
12062 SelectionDAG &DAG) {
12063 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12064 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12065 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12067 // Try to use shift instructions.
12068 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
12069 Zeroable, Subtarget, DAG))
12072 // Try to use byte rotation instructions.
12073 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12074 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12077 // Use dedicated pack instructions for masks that match their pattern.
12078 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
12082 // Try to use a zext lowering.
12083 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12084 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12087 // See if we can use SSE4A Extraction / Insertion.
12088 if (Subtarget.hasSSE4A())
12089 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
12093 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
12095 // For single-input shuffles, there are some nicer lowering tricks we can use.
12096 if (NumV2Elements == 0) {
12097 // Check for being able to broadcast a single element.
12098 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12099 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12102 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
12103 // Notably, this handles splat and partial-splat shuffles more efficiently.
12104 // However, it only makes sense if the pre-duplication shuffle simplifies
12105 // things significantly. Currently, this means we need to be able to
12106 // express the pre-duplication shuffle as an i16 shuffle.
12108 // FIXME: We should check for other patterns which can be widened into an
12109 // i16 shuffle as well.
12110 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
12111 for (int i = 0; i < 16; i += 2)
12112 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
12117 auto tryToWidenViaDuplication = [&]() -> SDValue {
12118 if (!canWidenViaDuplication(Mask))
12120 SmallVector<int, 4> LoInputs;
12121 copy_if(Mask, std::back_inserter(LoInputs),
12122 [](int M) { return M >= 0 && M < 8; });
12123 std::sort(LoInputs.begin(), LoInputs.end());
12124 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
12126 SmallVector<int, 4> HiInputs;
12127 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
12128 std::sort(HiInputs.begin(), HiInputs.end());
12129 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
12132 bool TargetLo = LoInputs.size() >= HiInputs.size();
12133 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
12134 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
12136 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
12137 SmallDenseMap<int, int, 8> LaneMap;
12138 for (int I : InPlaceInputs) {
12139 PreDupI16Shuffle[I/2] = I/2;
12142 int j = TargetLo ? 0 : 4, je = j + 4;
12143 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
12144 // Check if j is already a shuffle of this input. This happens when
12145 // there are two adjacent bytes after we move the low one.
12146 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
12147 // If we haven't yet mapped the input, search for a slot into which
12149 while (j < je && PreDupI16Shuffle[j] >= 0)
12153 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
12156 // Map this input with the i16 shuffle.
12157 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
12160 // Update the lane map based on the mapping we ended up with.
12161 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
12163 V1 = DAG.getBitcast(
12165 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12166 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12168 // Unpack the bytes to form the i16s that will be shuffled into place.
12169 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12170 MVT::v16i8, V1, V1);
12172 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12173 for (int i = 0; i < 16; ++i)
12174 if (Mask[i] >= 0) {
12175 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12176 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12177 if (PostDupI16Shuffle[i / 2] < 0)
12178 PostDupI16Shuffle[i / 2] = MappedMask;
12180 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12181 "Conflicting entries in the original shuffle!");
12183 return DAG.getBitcast(
12185 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12186 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
12188 if (SDValue V = tryToWidenViaDuplication())
12192 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
12196 // Use dedicated unpack instructions for masks that match their pattern.
12198 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
12201 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12202 // with PSHUFB. It is important to do this before we attempt to generate any
12203 // blends but after all of the single-input lowerings. If the single input
12204 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12205 // want to preserve that and we can DAG combine any longer sequences into
12206 // a PSHUFB in the end. But once we start blending from multiple inputs,
12207 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12208 // and there are *very* few patterns that would actually be faster than the
12209 // PSHUFB approach because of its ability to zero lanes.
12211 // FIXME: The only exceptions to the above are blends which are exact
12212 // interleavings with direct instructions supporting them. We currently don't
12213 // handle those well here.
12214 if (Subtarget.hasSSSE3()) {
12215 bool V1InUse = false;
12216 bool V2InUse = false;
12218 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12219 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12221 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12222 // do so. This avoids using them to handle blends-with-zero which is
12223 // important as a single pshufb is significantly faster for that.
12224 if (V1InUse && V2InUse) {
12225 if (Subtarget.hasSSE41())
12226 if (SDValue Blend = lowerVectorShuffleAsBlend(
12227 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12230 // We can use an unpack to do the blending rather than an or in some
12231 // cases. Even though the or may be (very minorly) more efficient, we
12232 // preference this lowering because there are common cases where part of
12233 // the complexity of the shuffles goes away when we do the final blend as
12235 // FIXME: It might be worth trying to detect if the unpack-feeding
12236 // shuffles will both be pshufb, in which case we shouldn't bother with
12238 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12239 DL, MVT::v16i8, V1, V2, Mask, DAG))
12242 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
12243 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
12244 return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
12250 // There are special ways we can lower some single-element blends.
12251 if (NumV2Elements == 1)
12252 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12253 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12256 if (SDValue BitBlend =
12257 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
12260 // Check whether a compaction lowering can be done. This handles shuffles
12261 // which take every Nth element for some even N. See the helper function for
12264 // We special case these as they can be particularly efficiently handled with
12265 // the PACKUSB instruction on x86 and they show up in common patterns of
12266 // rearranging bytes to truncate wide elements.
12267 bool IsSingleInput = V2.isUndef();
12268 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12269 // NumEvenDrops is the power of two stride of the elements. Another way of
12270 // thinking about it is that we need to drop the even elements this many
12271 // times to get the original input.
12273 // First we need to zero all the dropped bytes.
12274 assert(NumEvenDrops <= 3 &&
12275 "No support for dropping even elements more than 3 times.");
12276 // We use the mask type to pick which bytes are preserved based on how many
12277 // elements are dropped.
12278 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12279 SDValue ByteClearMask = DAG.getBitcast(
12280 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12281 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12282 if (!IsSingleInput)
12283 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12285 // Now pack things back together.
12286 V1 = DAG.getBitcast(MVT::v8i16, V1);
12287 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12288 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12289 for (int i = 1; i < NumEvenDrops; ++i) {
12290 Result = DAG.getBitcast(MVT::v8i16, Result);
12291 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
12297 // Handle multi-input cases by blending single-input shuffles.
12298 if (NumV2Elements > 0)
12299 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
12302 // The fallback path for single-input shuffles widens this into two v8i16
12303 // vectors with unpacks, shuffles those, and then pulls them back together
12307 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12308 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12309 for (int i = 0; i < 16; ++i)
12311 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12313 SDValue VLoHalf, VHiHalf;
12314 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12315 // them out and avoid using UNPCK{L,H} to extract the elements of V as
12317 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12318 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12319 // Use a mask to drop the high bytes.
12320 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12321 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12322 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12324 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12325 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12327 // Squash the masks to point directly into VLoHalf.
12328 for (int &M : LoBlendMask)
12331 for (int &M : HiBlendMask)
12335 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12336 // VHiHalf so that we can blend them as i16s.
12337 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12339 VLoHalf = DAG.getBitcast(
12340 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12341 VHiHalf = DAG.getBitcast(
12342 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
12345 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12346 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12348 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12351 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
12353 /// This routine breaks down the specific type of 128-bit shuffle and
12354 /// dispatches to the lowering routines accordingly.
// Dispatches on VT.SimpleTy to one per-type lowering helper; all helpers share
// the same (DL, Mask, Zeroable, V1, V2, Subtarget, DAG) signature.
12355 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12356 MVT VT, SDValue V1, SDValue V2,
12357 const APInt &Zeroable,
12358 const X86Subtarget &Subtarget,
12359 SelectionDAG &DAG) {
12360 switch (VT.SimpleTy) {
// NOTE(review): the `case MVT::v2i64:` / `case MVT::v2f64:` / ... labels that
// should precede each return are missing from this dump (the embedded line
// numbering skips 12361, 12363, 12365, ... ) — verify against upstream before
// treating this text as compilable.
12362 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12364 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12366 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12368 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12370 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12372 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
// Reached only for a vector type this dispatcher does not handle.
12375 llvm_unreachable("Unimplemented!");
12379 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
12381 /// This routine just extracts two subvectors, shuffles them independently, and
12382 /// then concatenates them back together. This should work effectively with all
12383 /// AVX vector shuffle types.
// NOTE(review): this dump is missing interior lines (embedded numbering has
// gaps, e.g. 12410-12412, 12436); annotations below describe only what is
// visible.
12384 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12385 SDValue V2, ArrayRef<int> Mask,
12386 SelectionDAG &DAG) {
12387 assert(VT.getSizeInBits() >= 256 &&
12388 "Only for 256-bit or wider vector shuffles!");
12389 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12390 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
// Split the mask into the halves that will produce the low and high results.
12392 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12393 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12395 int NumElements = VT.getVectorNumElements();
12396 int SplitNumElements = NumElements / 2;
12397 MVT ScalarVT = VT.getVectorElementType();
12398 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12400 // Rather than splitting build-vectors, just build two narrower build
12401 // vectors. This helps shuffling with splats and zeros.
// Returns a (lo, hi) pair of SplitVT values covering the two halves of V.
// For a BUILD_VECTOR input it rebuilds two narrower BUILD_VECTORs instead of
// emitting EXTRACT_SUBVECTOR nodes.
12402 auto SplitVector = [&](SDValue V) {
12403 V = peekThroughBitcasts(V);
12405 MVT OrigVT = V.getSimpleValueType();
12406 int OrigNumElements = OrigVT.getVectorNumElements();
12407 int OrigSplitNumElements = OrigNumElements / 2;
12408 MVT OrigScalarVT = OrigVT.getVectorElementType();
12409 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12413 auto *BV = dyn_cast<BuildVectorSDNode>(V);
// Generic path: extract the two halves as subvectors.
12415 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12416 DAG.getIntPtrConstant(0, DL));
12417 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12418 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
// Build-vector path: re-materialize each half from the original operands.
12421 SmallVector<SDValue, 16> LoOps, HiOps;
12422 for (int i = 0; i < OrigSplitNumElements; ++i) {
12423 LoOps.push_back(BV->getOperand(i));
12424 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12426 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12427 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12429 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12430 DAG.getBitcast(SplitVT, HiV));
12433 SDValue LoV1, HiV1, LoV2, HiV2;
12434 std::tie(LoV1, HiV1) = SplitVector(V1);
12435 std::tie(LoV2, HiV2) = SplitVector(V2);
12437 // Now create two 4-way blends of these half-width vectors.
// For one half-mask, classify each element as coming from lo/hi of V1/V2,
// build per-input shuffle masks, and emit the minimal shuffle tree.
12438 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12439 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12440 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12441 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12442 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12443 for (int i = 0; i < SplitNumElements; ++i) {
12444 int M = HalfMask[i];
12445 if (M >= NumElements) {
// Element comes from V2; record which half of V2 is referenced.
12446 if (M >= NumElements + SplitNumElements)
12450 V2BlendMask[i] = M - NumElements;
12451 BlendMask[i] = SplitNumElements + i;
12452 } else if (M >= 0) {
// Element comes from V1.
12453 if (M >= SplitNumElements)
12457 V1BlendMask[i] = M;
12462 // Because the lowering happens after all combining takes place, we need to
12463 // manually combine these blend masks as much as possible so that we create
12464 // a minimal number of high-level vector shuffle nodes.
12466 // First try just blending the halves of V1 or V2.
12467 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12468 return DAG.getUNDEF(SplitVT);
12469 if (!UseLoV2 && !UseHiV2)
12470 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12471 if (!UseLoV1 && !UseHiV1)
12472 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12474 SDValue V1Blend, V2Blend;
12475 if (UseLoV1 && UseHiV1) {
12477 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12479 // We only use half of V1 so map the usage down into the final blend mask.
12480 V1Blend = UseLoV1 ? LoV1 : HiV1;
12481 for (int i = 0; i < SplitNumElements; ++i)
12482 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12483 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12485 if (UseLoV2 && UseHiV2) {
12487 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12489 // We only use half of V2 so map the usage down into the final blend mask.
12490 V2Blend = UseLoV2 ? LoV2 : HiV2;
12491 for (int i = 0; i < SplitNumElements; ++i)
12492 if (BlendMask[i] >= SplitNumElements)
12493 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12495 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12497 SDValue Lo = HalfBlend(LoMask);
12498 SDValue Hi = HalfBlend(HiMask);
// Stitch the two half-width results back into a full-width vector.
12499 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12502 /// \brief Either split a vector in halves or decompose the shuffles and the
12505 /// This is provided as a good fallback for many lowerings of non-single-input
12506 /// shuffles with more than one 128-bit lane. In those cases, we want to select
12507 /// between splitting the shuffle into 128-bit components and stitching those
12508 /// back together vs. extracting the single-input shuffles and blending those
// NOTE(review): interior lines are missing from this dump (numbering gaps at
// 12523-12524, 12533-12536, 12550, ...); comments describe visible code only.
12510 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12511 SDValue V1, SDValue V2,
12512 ArrayRef<int> Mask,
12513 SelectionDAG &DAG) {
12514 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
12515 "shuffles as it could then recurse on itself.");
12516 int Size = Mask.size();
12518 // If this can be modeled as a broadcast of two elements followed by a blend,
12519 // prefer that lowering. This is especially important because broadcasts can
12520 // often fold with memory operands.
// Returns true when every V1 element references one index and every V2
// element references one index (i.e. both inputs are broadcasts).
12521 auto DoBothBroadcast = [&] {
12522 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
12525 if (V2BroadcastIdx < 0)
12526 V2BroadcastIdx = M - Size;
12527 else if (M - Size != V2BroadcastIdx)
12529 } else if (M >= 0) {
12530 if (V1BroadcastIdx < 0)
12531 V1BroadcastIdx = M;
12532 else if (M != V1BroadcastIdx)
12537 if (DoBothBroadcast())
12538 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
12541 // If the inputs all stem from a single 128-bit lane of each input, then we
12542 // split them rather than blending because the split will decompose to
12543 // unusually few instructions.
12544 int LaneCount = VT.getSizeInBits() / 128;
12545 int LaneSize = Size / LaneCount;
// LaneInputs[input][lane] records whether `input` contributes from `lane`.
12546 SmallBitVector LaneInputs[2];
12547 LaneInputs[0].resize(LaneCount, false);
12548 LaneInputs[1].resize(LaneCount, false);
12549 for (int i = 0; i < Size; ++i)
12551 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
12552 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
12553 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12555 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12556 // that the decomposed single-input shuffles don't end up here.
12557 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12560 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12561 /// a permutation and blend of those lanes.
12563 /// This essentially blends the out-of-lane inputs to each lane into the lane
12564 /// from a permuted copy of the vector. This lowering strategy results in four
12565 /// instructions in the worst case for a single-input cross lane shuffle which
12566 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
12567 /// of. Special cases for each particular shuffle pattern should be handled
12568 /// prior to trying this lowering.
// NOTE(review): a parameter line appears to be missing between 12571 and
// 12573 (presumably `SelectionDAG &DAG,`) — numbering gap in this dump.
12569 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12570 SDValue V1, SDValue V2,
12571 ArrayRef<int> Mask,
12573 const X86Subtarget &Subtarget) {
12574 // FIXME: This should probably be generalized for 512-bit vectors as well.
12575 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12576 int Size = Mask.size();
12577 int LaneSize = Size / 2;
12579 // If there are only inputs from one 128-bit lane, splitting will in fact be
12580 // less expensive. The flags track whether the given lane contains an element
12581 // that crosses to another lane.
12582 if (!Subtarget.hasAVX2()) {
12583 bool LaneCrossing[2] = {false, false};
12584 for (int i = 0; i < Size; ++i)
12585 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12586 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12587 if (!LaneCrossing[0] || !LaneCrossing[1])
12588 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
// AVX2 path: split instead whenever only one source lane is referenced.
12590 bool LaneUsed[2] = {false, false};
12591 for (int i = 0; i < Size; ++i)
12593 LaneUsed[(Mask[i] / LaneSize)] = true;
12594 if (!LaneUsed[0] || !LaneUsed[1])
12595 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12598 assert(V2.isUndef() &&
12599 "This last part of this routine only works on single input shuffles");
// Build a mask that reads in-lane elements from V1 and cross-lane elements
// from the lane-flipped copy (offset by Size to select the second operand).
12601 SmallVector<int, 32> FlippedBlendMask(Size);
12602 for (int i = 0; i < Size; ++i)
12603 FlippedBlendMask[i] =
12604 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12606 : Mask[i] % LaneSize +
12607 (i / LaneSize) * LaneSize + Size);
12609 // Flip the vector, and blend the results which should now be in-lane.
12610 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
12611 SDValue Flipped = DAG.getBitcast(PVT, V1);
12612 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
12614 Flipped = DAG.getBitcast(VT, Flipped);
12615 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12618 /// \brief Handle lowering 2-lane 128-bit shuffles.
// Lowers v4f64/v4i64-style shuffles whose mask can be widened to two 128-bit
// halves, preferring (in order): blends, VINSERTF128-style concat, SHUF128,
// and finally VPERM2X128 with an immediate control byte.
12619 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12620 SDValue V2, ArrayRef<int> Mask,
12621 const APInt &Zeroable,
12622 const X86Subtarget &Subtarget,
12623 SelectionDAG &DAG) {
12624 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
12625 if (Subtarget.hasAVX2() && V2.isUndef())
// Widen the element mask to a 2-element per-128-bit-half mask; bail (line
// missing in this dump) if that is not possible.
12628 SmallVector<int, 4> WidenedMask;
12629 if (!canWidenShuffleElements(Mask, WidenedMask))
12632 // TODO: If minimizing size and one of the inputs is a zero vector and the
12633 // the zero vector has only one use, we could use a VPERM2X128 to save the
12634 // instruction bytes needed to explicitly generate the zero vector.
12636 // Blends are faster and handle all the non-lane-crossing cases.
12637 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12638 Zeroable, Subtarget, DAG))
// Zeroable is a per-element bitmask here; 0x3 = low half, 0xc = high half.
12641 bool IsLowZero = (Zeroable & 0x3) == 0x3;
12642 bool IsHighZero = (Zeroable & 0xc) == 0xc;
12644 // If either input operand is a zero vector, use VPERM2X128 because its mask
12645 // allows us to replace the zero input with an implicit zero.
12646 if (!IsLowZero && !IsHighZero) {
12647 // Check for patterns which can be matched with a single insert of a 128-bit
12649 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12650 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12652 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
12653 // this will likely become vinsertf128 which can't fold a 256-bit memop.
12654 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
12655 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12656 VT.getVectorNumElements() / 2);
12657 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12658 DAG.getIntPtrConstant(0, DL));
12659 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12660 OnlyUsesV1 ? V1 : V2,
12661 DAG.getIntPtrConstant(0, DL));
12662 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12666 // Try to use SHUF128 if possible.
12667 if (Subtarget.hasVLX()) {
// SHUF128 requires the low half from V1 and the high half from V2.
12668 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
12669 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
12670 ((WidenedMask[1] % 2) << 1);
12671 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
12672 DAG.getConstant(PermMask, DL, MVT::i8));
12677 // Otherwise form a 128-bit permutation. After accounting for undefs,
12678 // convert the 64-bit shuffle mask selection values into 128-bit
12679 // selection bits by dividing the indexes by 2 and shifting into positions
12680 // defined by a vperm2*128 instruction's immediate control byte.
12682 // The immediate permute control byte looks like this:
12683 // [1:0] - select 128 bits from sources for low half of destination
12685 // [3] - zero low half of destination
12686 // [5:4] - select 128 bits from sources for high half of destination
12688 // [7] - zero high half of destination
12690 assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
12692 unsigned PermMask = 0;
// 0x08 / 0x80 are the "zero this half" bits of the immediate.
12693 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
12694 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
12696 // Check the immediate mask and replace unused sources with undef.
12697 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
12698 V1 = DAG.getUNDEF(VT);
12699 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
12700 V2 = DAG.getUNDEF(VT);
12702 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12703 DAG.getConstant(PermMask, DL, MVT::i8));
12706 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12707 /// shuffling each lane.
12709 /// This will only succeed when the result of fixing the 128-bit lanes results
12710 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12711 /// each 128-bit lanes. This handles many cases where we can quickly blend away
12712 /// the lane crosses early and then use simpler shuffles within each lane.
12714 /// FIXME: It might be worthwhile at some point to support this without
12715 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12716 /// in x86 only floating point has interesting non-repeating shuffles, and even
12717 /// those are still *marginally* more expensive.
12718 static SDValue lowerVectorShuffleByMerging128BitLanes(
12719 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12720 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12721 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12723 int Size = Mask.size();
12724 int LaneSize = 128 / VT.getScalarSizeInBits();
12725 int NumLanes = Size / LaneSize;
12726 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12728 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12729 // check whether the in-128-bit lane shuffles share a repeating pattern.
// Lanes[j] = source lane for destination lane j; InLaneMask = the single
// repeated per-lane element pattern. Both are filled in (and validated)
// during the scan below; bail-out lines are missing from this dump.
12730 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12731 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12732 for (int i = 0; i < Size; ++i) {
12736 int j = i / LaneSize;
12738 if (Lanes[j] < 0) {
12739 // First entry we've seen for this lane.
12740 Lanes[j] = Mask[i] / LaneSize;
12741 } else if (Lanes[j] != Mask[i] / LaneSize) {
12742 // This doesn't match the lane selected previously!
12746 // Check that within each lane we have a consistent shuffle mask.
12747 int k = i % LaneSize;
12748 if (InLaneMask[k] < 0) {
12749 InLaneMask[k] = Mask[i] % LaneSize;
12750 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12751 // This doesn't fit a repeating in-lane mask.
12756 // First shuffle the lanes into place.
// Express the lane fix-up as a 64-bit-element shuffle (2 elements per lane).
12757 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12758 VT.getSizeInBits() / 64);
12759 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12760 for (int i = 0; i < NumLanes; ++i)
12761 if (Lanes[i] >= 0) {
12762 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12763 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12766 V1 = DAG.getBitcast(LaneVT, V1);
12767 V2 = DAG.getBitcast(LaneVT, V2);
12768 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12770 // Cast it back to the type we actually want.
12771 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12773 // Now do a simple shuffle that isn't lane crossing.
12774 SmallVector<int, 8> NewMask((unsigned)Size, -1);
12775 for (int i = 0; i < Size; ++i)
12777 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12778 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12779 "Must not introduce lane crosses at this point!");
12781 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12784 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
12785 /// This allows for fast cases such as subvector extraction/insertion
12786 /// or shuffling smaller vector types which can lower more efficiently.
12787 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12788 SDValue V1, SDValue V2,
12789 ArrayRef<int> Mask,
12790 const X86Subtarget &Subtarget,
12791 SelectionDAG &DAG) {
12792 assert((VT.is256BitVector() || VT.is512BitVector()) &&
12793 "Expected 256-bit or 512-bit vector");
12795 unsigned NumElts = VT.getVectorNumElements();
12796 unsigned HalfNumElts = NumElts / 2;
12797 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
// Exactly one half of the mask must be entirely undef for this lowering.
12799 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12800 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12801 if (!UndefLower && !UndefUpper)
12804 // Upper half is undef and lower half is whole upper subvector.
12805 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12807 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12808 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12809 DAG.getIntPtrConstant(HalfNumElts, DL));
12810 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12811 DAG.getIntPtrConstant(0, DL));
12814 // Lower half is undef and upper half is whole lower subvector.
12815 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12817 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12818 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12819 DAG.getIntPtrConstant(0, DL));
12820 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12821 DAG.getIntPtrConstant(HalfNumElts, DL));
12824 // If the shuffle only uses two of the four halves of the input operands,
12825 // then extract them and perform the 'half' shuffle at half width.
12826 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12827 int HalfIdx1 = -1, HalfIdx2 = -1;
12828 SmallVector<int, 8> HalfMask(HalfNumElts);
// Offset selects the defined half of the original mask.
12829 unsigned Offset = UndefLower ? HalfNumElts : 0;
12830 for (unsigned i = 0; i != HalfNumElts; ++i) {
12831 int M = Mask[i + Offset];
12837 // Determine which of the 4 half vectors this element is from.
12838 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12839 int HalfIdx = M / HalfNumElts;
12841 // Determine the element index into its half vector source.
12842 int HalfElt = M % HalfNumElts;
12844 // We can shuffle with up to 2 half vectors, set the new 'half'
12845 // shuffle mask accordingly.
12846 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12847 HalfMask[i] = HalfElt;
12848 HalfIdx1 = HalfIdx;
12851 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12852 HalfMask[i] = HalfElt + HalfNumElts;
12853 HalfIdx2 = HalfIdx;
12857 // Too many half vectors referenced.
12860 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12862 // Only shuffle the halves of the inputs when useful.
// Heuristics: avoid lowerings that force extra cross-lane extracts/inserts.
12863 int NumLowerHalves =
12864 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12865 int NumUpperHalves =
12866 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12868 // uuuuXXXX - don't extract uppers just to insert again.
12869 if (UndefLower && NumUpperHalves != 0)
12872 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12873 if (UndefUpper && NumUpperHalves == 2)
12876 // AVX2 - XXXXuuuu - always extract lowers.
12877 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12878 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12879 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12881 // AVX2 supports variable 32-bit element cross-lane shuffles.
12882 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12883 // XXXXuuuu - don't extract lowers and uppers.
12884 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12889 // AVX512 - XXXXuuuu - always extract lowers.
12890 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
// Materialize the half vector identified by HalfIdx (undef when unused).
12893 auto GetHalfVector = [&](int HalfIdx) {
12895 return DAG.getUNDEF(HalfVT);
12896 SDValue V = (HalfIdx < 2 ? V1 : V2);
12897 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12898 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12899 DAG.getIntPtrConstant(HalfIdx, DL));
12902 SDValue Half1 = GetHalfVector(HalfIdx1);
12903 SDValue Half2 = GetHalfVector(HalfIdx2);
12904 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
// Insert the half-width result into the defined half of a full-width undef.
12905 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12906 DAG.getIntPtrConstant(Offset, DL));
12909 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
12912 /// This returns true if the elements from a particular input are already in the
12913 /// slot required by the given mask and require no permutation.
12914 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12915 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12916 int Size = Mask.size();
// Any element from `Input` that is not already at its identity position
// disqualifies the input. (The `return false;` / `return true;` tail lines
// appear to be missing from this dump — numbering jumps 12918 -> 12924.)
12917 for (int i = 0; i < Size; ++i)
12918 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12924 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12925 /// every lane can be represented as the same repeating mask - allowing us to
12926 /// shuffle the sources with the repeating shuffle and then permute the result
12927 /// to the destination lanes.
12928 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12929 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12930 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12931 int NumElts = VT.getVectorNumElements();
12932 int NumLanes = VT.getSizeInBits() / 128;
12933 int NumLaneElts = NumElts / NumLanes;
12935 // On AVX2 we may be able to just shuffle the lowest elements and then
12936 // broadcast the result.
12937 if (Subtarget.hasAVX2()) {
12938 for (unsigned BroadcastSize : {16, 32, 64}) {
12939 if (BroadcastSize <= VT.getScalarSizeInBits())
12941 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12943 // Attempt to match a repeating pattern every NumBroadcastElts,
12944 // accounting for UNDEFs but only references the lowest 128-bit
12945 // lane of the inputs.
12946 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12947 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12948 for (int j = 0; j != NumBroadcastElts; ++j) {
12949 int M = Mask[i + j];
12952 int &R = RepeatMask[j];
12953 if (0 != ((M % NumElts) / NumLaneElts))
12955 if (0 <= R && R != M)
12962 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12963 if (!FindRepeatingBroadcastMask(RepeatMask))
12966 // Shuffle the (lowest) repeated elements in place for broadcast.
12967 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12969 // Shuffle the actual broadcast.
12970 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12971 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12972 for (int j = 0; j != NumBroadcastElts; ++j)
12973 BroadcastMask[i + j] = j;
12974 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12979 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12980 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12983 // Bail if we already have a repeated lane shuffle mask.
12984 SmallVector<int, 8> RepeatedShuffleMask;
12985 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12988 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12989 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12990 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12991 int NumSubLanes = NumLanes * SubLaneScale;
12992 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12994 // Check that all the sources are coming from the same lane and see if we can
12995 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12996 // determine the source sub-lane for each destination sub-lane.
12997 int TopSrcSubLane = -1;
12998 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12999 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
13000 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
13001 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
13003 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
13004 // Extract the sub-lane mask, check that it all comes from the same lane
13005 // and normalize the mask entries to come from the first lane.
13007 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
13008 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13009 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
13012 int Lane = (M % NumElts) / NumLaneElts;
13013 if ((0 <= SrcLane) && (SrcLane != Lane))
13016 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
13017 SubLaneMask[Elt] = LocalM;
13020 // Whole sub-lane is UNDEF.
13024 // Attempt to match against the candidate repeated sub-lane masks.
13025 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
13026 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
13027 for (int i = 0; i != NumSubLaneElts; ++i) {
13028 if (M1[i] < 0 || M2[i] < 0)
13030 if (M1[i] != M2[i])
13036 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
13037 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
13040 // Merge the sub-lane mask into the matching repeated sub-lane mask.
13041 for (int i = 0; i != NumSubLaneElts; ++i) {
13042 int M = SubLaneMask[i];
13045 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
13046 "Unexpected mask element");
13047 RepeatedSubLaneMask[i] = M;
13050 // Track the top most source sub-lane - by setting the remaining to UNDEF
13051 // we can greatly simplify shuffle matching.
13052 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
13053 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
13054 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
13058 // Bail if we failed to find a matching repeated sub-lane mask.
13059 if (Dst2SrcSubLanes[DstSubLane] < 0)
13062 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
13063 "Unexpected source lane");
13065 // Create a repeating shuffle mask for the entire vector.
13066 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
13067 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
13068 int Lane = SubLane / SubLaneScale;
13069 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
13070 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13071 int M = RepeatedSubLaneMask[Elt];
13074 int Idx = (SubLane * NumSubLaneElts) + Elt;
13075 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
13078 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
13080 // Shuffle each source sub-lane to its destination.
13081 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
13082 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
13083 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
13084 if (SrcSubLane < 0)
13086 for (int j = 0; j != NumSubLaneElts; ++j)
13087 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
13090 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
// Match a 64-bit-element shuffle mask against the SHUFPD/VSHUFPD pattern and,
// on success, build the instruction's immediate control byte in ShuffleImm.
// V1/V2 are taken by reference so a commuted match can swap the operands
// (the commute and the final return live in lines elided from this view —
// TODO confirm against the full file).
13094 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
13095 unsigned &ShuffleImm,
13096 ArrayRef<int> Mask) {
13097 int NumElts = VT.getVectorNumElements();
// SHUFPD only exists for f64 lanes; legal widths are v2f64/v4f64/v8f64.
13098 assert(VT.getScalarSizeInBits() == 64 &&
13099 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
13100 "Unexpected data type for VSHUFPD");
13102 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
13103 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
// Track a direct match and an operand-commuted match in parallel; either
// surviving the loop is enough to use SHUFPD.
13105 bool ShufpdMask = true;
13106 bool CommutableMask = true;
13107 for (int i = 0; i < NumElts; ++i) {
// Undef elements match anything (the skip `continue` is elided here).
13108 if (Mask[i] == SM_SentinelUndef)
// Val is the expected base index when element i comes from the operand the
// direct pattern requires; CommutVal is the same with V1/V2 roles swapped.
// Each position may select either of two adjacent elements (Val or Val+1).
13112 int Val = (i & 6) + NumElts * (i & 1);
13113 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
13114 if (Mask[i] < Val || Mask[i] > Val + 1)
13115 ShufpdMask = false;
13116 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
13117 CommutableMask = false;
// One immediate bit per element: low (0) or high (1) half of the pair.
13118 ShuffleImm |= (Mask[i] % 2) << i;
// Presumably the commuted case swaps V1/V2 before returning true; the body
// of this branch is elided from this view — verify against the full file.
13123 if (CommutableMask) {
// Lower a 64-bit-element shuffle to X86ISD::SHUFP when the mask fits the
// SHUFPD pattern; returns a null SDValue (in an elided line) otherwise so
// callers can fall through to other strategies.
13131 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
13132 ArrayRef<int> Mask, SDValue V1,
13133 SDValue V2, SelectionDAG &DAG) {
13134 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
13135 "Unexpected data type for VSHUFPD")
13137 unsigned Immediate = 0;
// The matcher fills in Immediate and may commute V1/V2 (by-reference params).
13138 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
13141 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13142 DAG.getConstant(Immediate, DL, MVT::i8));
13145 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
13147 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13148 /// isn't available.
///
/// Tries a cascade of progressively more general strategies: whole-128-bit
/// subvector shuffle, broadcast, MOVDDUP, VPERMILPI/VPERMI for single-input
/// masks, UNPCK, blend, SHUFPD, repeated-mask + lane permute, 128-bit lane
/// merging, VEXPAND (VLX), then a decomposed blend (AVX2) or split lowering.
/// Each helper returns a null SDValue on failure so control falls through.
13149 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13150 const APInt &Zeroable,
13151 SDValue V1, SDValue V2,
13152 const X86Subtarget &Subtarget,
13153 SelectionDAG &DAG) {
13154 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13155 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13156 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// First see whether this is really a shuffle of whole 128-bit halves.
13158 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13159 Zeroable, Subtarget, DAG))
// Single-input shuffles get the cheap dedicated paths below.
13162 if (V2.isUndef()) {
13163 // Check for being able to broadcast a single element.
13164 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13165 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13168 // Use low duplicate instructions for masks that match their pattern.
13169 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13170 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13172 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13173 // Non-half-crossing single input shuffles can be lowered with an
13174 // interleaved permutation.
// Each mask element picks the low/high f64 of its own 128-bit half, which
// encodes directly as one immediate bit per element.
13175 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13176 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13177 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13178 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13181 // With AVX2 we have direct support for this permutation.
13182 if (Subtarget.hasAVX2())
13183 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13184 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13186 // Try to create an in-lane repeating shuffle mask and then shuffle the
13187 // the results into the target lanes.
13188 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13189 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13192 // Otherwise, fall back.
13193 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
// Two-input paths from here on.
13197 // Use dedicated unpack instructions for masks that match their pattern.
13199 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13202 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13203 Zeroable, Subtarget, DAG))
13206 // Check if the blend happens to exactly fit that of SHUFPD.
13208 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
13211 // Try to create an in-lane repeating shuffle mask and then shuffle the
13212 // the results into the target lanes.
13213 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13214 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13217 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13218 // shuffle. However, if we have AVX2 and either inputs are already in place,
13219 // we will be able to shuffle even across lanes the other input in a single
13220 // instruction so skip this pattern.
13221 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13222 isShuffleMaskInputInPlace(1, Mask))))
13223 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13224 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13226 // If we have VLX support, we can use VEXPAND.
13227 if (Subtarget.hasVLX())
13228 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13229 V1, V2, DAG, Subtarget))
13232 // If we have AVX2 then we always want to lower with a blend because an v4 we
13233 // can fully permute the elements.
13234 if (Subtarget.hasAVX2())
13235 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13238 // Otherwise fall back on generic lowering.
13239 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13242 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
13244 /// This routine is only called when we have AVX2 and thus a reasonable
13245 /// instruction set for v4i64 shuffling..
///
/// Strategy cascade: 2x128-bit subvector shuffle, blend, broadcast,
/// single-input PSHUFD/VPERMQ, shifts, VALIGN/VEXPAND (VLX), PALIGNR,
/// UNPCK, 128-bit lane merging, then a decomposed shuffle+blend fallback.
13246 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13247 const APInt &Zeroable,
13248 SDValue V1, SDValue V2,
13249 const X86Subtarget &Subtarget,
13250 SelectionDAG &DAG) {
13251 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13252 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13253 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13254 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13256 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13257 Zeroable, Subtarget, DAG))
13260 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13261 Zeroable, Subtarget, DAG))
13264 // Check for being able to broadcast a single element.
13265 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13266 Mask, Subtarget, DAG))
13269 if (V2.isUndef()) {
13270 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13271 // can use lower latency instructions that will operate on both lanes.
13272 SmallVector<int, 2> RepeatedMask;
13273 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
// Widen the 2-element repeated mask to a 4-element i32 mask so the cheaper
// in-lane PSHUFD (via a v8i32 bitcast) can implement it.
13274 SmallVector<int, 4> PSHUFDMask;
13275 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13276 return DAG.getBitcast(
13278 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13279 DAG.getBitcast(MVT::v8i32, V1),
13280 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13283 // AVX2 provides a direct instruction for permuting a single input across
13285 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13286 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13289 // Try to use shift instructions.
13290 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13291 Zeroable, Subtarget, DAG))
13294 // If we have VLX support, we can use VALIGN or VEXPAND.
13295 if (Subtarget.hasVLX()) {
13296 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13297 Mask, Subtarget, DAG))
13300 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13301 V1, V2, DAG, Subtarget))
13305 // Try to use PALIGNR.
13306 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13307 Mask, Subtarget, DAG))
13310 // Use dedicated unpack instructions for masks that match their pattern.
13312 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
13315 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13316 // shuffle. However, if we have AVX2 and either inputs are already in place,
13317 // we will be able to shuffle even across lanes the other input in a single
13318 // instruction so skip this pattern.
13319 if (!isShuffleMaskInputInPlace(0, Mask) &&
13320 !isShuffleMaskInputInPlace(1, Mask))
13321 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13322 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13325 // Otherwise fall back on generic blend lowering.
13326 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13330 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
13332 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
13333 /// isn't available.
///
/// Strategy cascade: blend, broadcast, per-128-bit-lane repeated masks
/// (MOVSLDUP/MOVSHDUP/VPERMILPI/UNPCK/SHUFPS), repeated-mask + lane permute,
/// variable-mask VPERMILPS/VPERMPS for single inputs, lane merging, VEXPAND
/// (VLX), and finally split-or-blend / decomposed-blend fallbacks.
13334 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13335 const APInt &Zeroable,
13336 SDValue V1, SDValue V2,
13337 const X86Subtarget &Subtarget,
13338 SelectionDAG &DAG) {
13339 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13340 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13341 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13343 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
13344 Zeroable, Subtarget, DAG))
13347 // Check for being able to broadcast a single element.
13348 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
13349 Mask, Subtarget, DAG))
13352 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13353 // options to efficiently lower the shuffle.
13354 SmallVector<int, 4> RepeatedMask;
13355 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
13356 assert(RepeatedMask.size() == 4 &&
13357 "Repeated masks must be half the mask width!");
13359 // Use even/odd duplicate instructions for masks that match their pattern.
13360 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13361 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
13362 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13363 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
// Single-input repeated mask: an immediate VPERMILPI covers it (the guard
// for this branch is elided from this view).
13366 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
13367 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13369 // Use dedicated unpack instructions for masks that match their pattern.
13371 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
13374 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
13375 // have already handled any direct blends.
13376 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
13379 // Try to create an in-lane repeating shuffle mask and then shuffle the
13380 // the results into the target lanes.
13381 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13382 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13385 // If we have a single input shuffle with different shuffle patterns in the
13386 // two 128-bit lanes use the variable mask to VPERMILPS.
13387 if (V2.isUndef()) {
13388 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13389 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
13390 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
// Lane-crossing single input: AVX2's VPERMPS can do it in one instruction.
13392 if (Subtarget.hasAVX2())
13393 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
13395 // Otherwise, fall back.
13396 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
13400 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13402 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13403 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13405 // If we have VLX support, we can use VEXPAND.
13406 if (Subtarget.hasVLX())
13407 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
13408 V1, V2, DAG, Subtarget))
13411 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
13412 // since after split we get a more efficient code using vpunpcklwd and
13413 // vpunpckhwd instrs than vblend.
13414 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
13415 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
13419 // If we have AVX2 then we always want to lower with a blend because at v8 we
13420 // can fully permute the elements.
13421 if (Subtarget.hasAVX2())
13422 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
13425 // Otherwise fall back on generic lowering.
13426 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
13429 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
13431 /// This routine is only called when we have AVX2 and thus a reasonable
13432 /// instruction set for v8i32 shuffling..
///
/// Strategy cascade: zero/any-extend, split for unpack-wd masks, blend,
/// broadcast, per-lane repeated PSHUFD/UNPCK, shifts, VALIGN/VEXPAND (VLX),
/// byte rotates, repeated-mask + lane permute, single-input VPERMD, a SHUFPS
/// domain-crossing fallback, lane merging, then decomposed shuffle+blend.
13433 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13434 const APInt &Zeroable,
13435 SDValue V1, SDValue V2,
13436 const X86Subtarget &Subtarget,
13437 SelectionDAG &DAG) {
13438 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13439 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13440 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13441 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
13443 // Whenever we can lower this as a zext, that instruction is strictly faster
13444 // than any alternative. It also allows us to fold memory operands into the
13445 // shuffle in many cases.
13446 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13447 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13450 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
13451 // since after split we get a more efficient code than vblend by using
13452 // vpunpcklwd and vpunpckhwd instrs.
13453 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
13454 !Subtarget.hasAVX512())
13456 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
13459 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
13460 Zeroable, Subtarget, DAG))
13463 // Check for being able to broadcast a single element.
13464 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13465 Mask, Subtarget, DAG))
13468 // If the shuffle mask is repeated in each 128-bit lane we can use more
13469 // efficient instructions that mirror the shuffles across the two 128-bit
// Remember the result: it is reused below for the SHUFPS fallback.
13471 SmallVector<int, 4> RepeatedMask;
13472 bool Is128BitLaneRepeatedShuffle =
13473 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13474 if (Is128BitLaneRepeatedShuffle) {
13475 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
// Single-input repeated mask maps straight to an immediate PSHUFD (the
// guard for this branch is elided from this view).
13477 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13478 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13480 // Use dedicated unpack instructions for masks that match their pattern.
13482 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13486 // Try to use shift instructions.
13487 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13488 Zeroable, Subtarget, DAG))
13491 // If we have VLX support, we can use VALIGN or EXPAND.
13492 if (Subtarget.hasVLX()) {
13493 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13494 Mask, Subtarget, DAG))
13497 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13498 V1, V2, DAG, Subtarget))
13502 // Try to use byte rotation instructions.
13503 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13504 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13507 // Try to create an in-lane repeating shuffle mask and then shuffle the
13508 // results into the target lanes.
13509 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13510 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13513 // If the shuffle patterns aren't repeated but it is a single input, directly
13514 // generate a cross-lane VPERMD instruction.
13515 if (V2.isUndef()) {
13516 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13517 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13520 // Assume that a single SHUFPS is faster than an alternative sequence of
13521 // multiple instructions (even if the CPU has a domain penalty).
13522 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13523 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
// Deliberate int->fp domain crossing: bitcast both inputs, SHUFPS, cast back.
13524 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13525 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13526 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13527 CastV1, CastV2, DAG);
13528 return DAG.getBitcast(MVT::v8i32, ShufPS);
13531 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13533 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13534 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13537 // Otherwise fall back on generic blend lowering.
13538 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13542 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13544 /// This routine is only called when we have AVX2 and thus a reasonable
13545 /// instruction set for v16i16 shuffling..
///
/// Strategy cascade: zero/any-extend, broadcast, blend, UNPCK, PACK, shifts,
/// byte rotates, repeated-mask + lane permute, single-input paths (lane
/// permute+blend, or reuse of the v8i16 general lowering on a repeated
/// mask), PSHUFB, VPERMW (BWI+VLX), lane merging, then split-or-blend.
13546 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13547 const APInt &Zeroable,
13548 SDValue V1, SDValue V2,
13549 const X86Subtarget &Subtarget,
13550 SelectionDAG &DAG) {
13551 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13552 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13553 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13554 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13556 // Whenever we can lower this as a zext, that instruction is strictly faster
13557 // than any alternative. It also allows us to fold memory operands into the
13558 // shuffle in many cases.
13559 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13560 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13563 // Check for being able to broadcast a single element.
13564 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13565 Mask, Subtarget, DAG))
13568 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13569 Zeroable, Subtarget, DAG))
13572 // Use dedicated unpack instructions for masks that match their pattern.
13574 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13577 // Use dedicated pack instructions for masks that match their pattern.
13578 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
13582 // Try to use shift instructions.
13583 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13584 Zeroable, Subtarget, DAG))
13587 // Try to use byte rotation instructions.
13588 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13589 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13592 // Try to create an in-lane repeating shuffle mask and then shuffle the
13593 // the results into the target lanes.
13594 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13595 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13598 if (V2.isUndef()) {
13599 // There are no generalized cross-lane shuffle operations available on i16
13601 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13602 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13603 Mask, DAG, Subtarget);
13605 SmallVector<int, 8> RepeatedMask;
13606 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13607 // As this is a single-input shuffle, the repeated mask should be
13608 // a strictly valid v8i16 mask that we can pass through to the v8i16
13609 // lowering to handle even the v16 case.
13610 return lowerV8I16GeneralSingleInputVectorShuffle(
13611 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13615 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13616 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13619 // AVX512BWVL can lower to VPERMW.
13620 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13621 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13623 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13625 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13626 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13629 // Otherwise fall back on generic lowering.
13630 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13633 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13635 /// This routine is only called when we have AVX2 and thus a reasonable
13636 /// instruction set for v32i8 shuffling..
///
/// Mirrors the v16i16 cascade at byte granularity: zero/any-extend,
/// broadcast, blend, UNPCK, PACK, shifts, byte rotates, repeated-mask +
/// lane permute, lane permute+blend for cross-lane single inputs, PSHUFB,
/// VPERMB (VBMI+VLX), lane merging, then split-or-blend.
13637 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13638 const APInt &Zeroable,
13639 SDValue V1, SDValue V2,
13640 const X86Subtarget &Subtarget,
13641 SelectionDAG &DAG) {
13642 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13643 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13644 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13645 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13647 // Whenever we can lower this as a zext, that instruction is strictly faster
13648 // than any alternative. It also allows us to fold memory operands into the
13649 // shuffle in many cases.
13650 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13651 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13654 // Check for being able to broadcast a single element.
13655 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13656 Mask, Subtarget, DAG))
13659 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13660 Zeroable, Subtarget, DAG))
13663 // Use dedicated unpack instructions for masks that match their pattern.
13665 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13668 // Use dedicated pack instructions for masks that match their pattern.
13669 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
13673 // Try to use shift instructions.
13674 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13675 Zeroable, Subtarget, DAG))
13678 // Try to use byte rotation instructions.
13679 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13680 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13683 // Try to create an in-lane repeating shuffle mask and then shuffle the
13684 // the results into the target lanes.
13685 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13686 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13689 // There are no generalized cross-lane shuffle operations available on i8
13691 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13692 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13695 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13696 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13699 // AVX512VBMIVL can lower to VPERMB.
13700 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
13701 return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
13703 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13705 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13706 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13709 // Otherwise fall back on generic lowering.
13710 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13713 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13715 /// This routine either breaks down the specific type of a 256-bit x86 vector
13716 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13717 /// together based on the available instructions.
///
/// After handling zero-element insertion, undef halves, and the pre-AVX2
/// integer case, it dispatches by element type to the per-type lowering
/// routines above (the switch's case labels are elided from this view).
13718 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13719 MVT VT, SDValue V1, SDValue V2,
13720 const APInt &Zeroable,
13721 const X86Subtarget &Subtarget,
13722 SelectionDAG &DAG) {
13723 // If we have a single input to the zero element, insert that into V1 if we
13724 // can do so cheaply.
13725 int NumElts = VT.getVectorNumElements();
13726 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13728 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13729 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13730 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13733 // Handle special cases where the lower or upper half is UNDEF.
13735 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13738 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13739 // can check for those subtargets here and avoid much of the subtarget
13740 // querying in the per-vector-type lowering routines. With AVX1 we have
13741 // essentially *zero* ability to manipulate a 256-bit vector with integer
13742 // types. Since we'll use floating point types there eventually, just
13743 // immediately cast everything to a float and operate entirely in that domain.
13744 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13745 int ElementBits = VT.getScalarSizeInBits();
13746 if (ElementBits < 32) {
13747 // No floating point type available, if we can't use the bit operations
13748 // for masking/blending then decompose into 128-bit vectors.
13750 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13752 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13754 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
// 32/64-bit integer elements: shuffle in the equivalent FP type instead.
13757 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13758 VT.getVectorNumElements());
13759 V1 = DAG.getBitcast(FpVT, V1);
13760 V2 = DAG.getBitcast(FpVT, V2);
13761 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
// Per-element-type dispatch (case labels elided in this view).
13764 switch (VT.SimpleTy) {
13766 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13768 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13770 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13772 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13774 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13776 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13779 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13783 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
///
/// For 512-bit vectors with 64-bit elements: widen the mask to four 128-bit
/// chunks, then try (1) a single 256-bit subvector concat, (2) an insertion
/// of V2's low 128 bits into V1, and finally (3) a SHUF128 with an
/// immediate control byte.
13784 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13785 ArrayRef<int> Mask, SDValue V1,
13786 SDValue V2, SelectionDAG &DAG) {
13787 assert(VT.getScalarSizeInBits() == 64 &&
13788 "Unexpected element type size for 128bit shuffle.");
13790 // To handle 256 bit vector requires VLX and most probably
13791 // function lowerV2X128VectorShuffle() is better solution.
13792 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
// Bail (in an elided line) unless the mask can be expressed in 128-bit units.
13794 SmallVector<int, 4> WidenedMask;
13795 if (!canWidenShuffleElements(Mask, WidenedMask))
13798 // Check for patterns which can be matched with a single insert of a 256-bit
13800 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13801 {0, 1, 2, 3, 0, 1, 2, 3});
13802 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13803 {0, 1, 2, 3, 8, 9, 10, 11})) {
// Low half of V1 concatenated with the low half of V1 or V2.
13804 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13805 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13806 DAG.getIntPtrConstant(0, DL));
13807 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13808 OnlyUsesV1 ? V1 : V2,
13809 DAG.getIntPtrConstant(0, DL));
13810 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13813 assert(WidenedMask.size() == 4);
13815 // See if this is an insertion of the lower 128-bits of V2 into V1.
// V2Index's declaration/initialization is elided here; it records the single
// position (if any) that reads V2's low chunk.
13816 bool IsInsert = true;
13818 for (int i = 0; i < 4; ++i) {
13819 assert(WidenedMask[i] >= -1);
13820 if (WidenedMask[i] < 0)
13823 // Make sure all V1 subvectors are in place.
13824 if (WidenedMask[i] < 4) {
13825 if (WidenedMask[i] != i) {
13830 // Make sure we only have a single V2 index and its the lowest 128-bits.
13831 if (V2Index >= 0 || WidenedMask[i] != 4) {
13838 if (IsInsert && V2Index >= 0) {
13839 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13840 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13841 DAG.getIntPtrConstant(0, DL));
13842 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13845 // Try to lower to to vshuf64x2/vshuf32x4.
13846 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13847 unsigned PermMask = 0;
13848 // Insure elements came from the same Op.
13849 for (int i = 0; i < 4; ++i) {
13850 assert(WidenedMask[i] >= -1);
13851 if (WidenedMask[i] < 0)
13854 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
// SHUF128 reads its low result half from Ops[0] and its high half from
// Ops[1]; each half must come from a single source operand.
13855 unsigned OpIndex = i / 2;
13856 if (Ops[OpIndex].isUndef())
13858 else if (Ops[OpIndex] != Op)
13861 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13862 // bits defined by a vshuf64x2 instruction's immediate control byte.
13863 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13866 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13867 DAG.getConstant(PermMask, DL, MVT::i8));
13870 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
///
/// Strategy order: MOVDDUP duplicate, in-lane VPERMILPI, 256-bit-repeated
/// VPERMI, 128-bit subvector shuffle, UNPCK, SHUFPD, VEXPAND, blend, and
/// finally the fully-general PERMV.
/// NOTE(review): embedded line numbers skip values; elided lines likely held
/// the 'return'/closing-brace statements between the lowering attempts.
13871 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13872 const APInt &Zeroable,
13873 SDValue V1, SDValue V2,
13874 const X86Subtarget &Subtarget,
13875 SelectionDAG &DAG) {
13876 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13877 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13878 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13880 if (V2.isUndef()) {
13881 // Use low duplicate instructions for masks that match their pattern.
13882 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13883 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13885 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13886 // Non-half-crossing single input shuffles can be lowered with an
13887 // interleaved permutation.
// Each bit selects the odd element of its 128-bit pair when set.
13888 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13889 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13890 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13891 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13892 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13893 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13896 SmallVector<int, 4> RepeatedMask;
13897 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13898 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13899 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13902 if (SDValue Shuf128 =
13903 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13906 if (SDValue Unpck =
13907 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13910 // Check if the blend happens to exactly fit that of SHUFPD.
13912 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13915 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13916 V2, DAG, Subtarget))
13919 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13920 Zeroable, Subtarget, DAG))
// Last resort: fully general variable permute.
13923 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13926 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
///
/// Prefers in-lane forms (MOVSLDUP/MOVSHDUP, VPERMILPI, SHUFPS) when the mask
/// repeats per 128-bit lane, then UNPCK/blend, variable VPERMILPV, VEXPAND,
/// and finally the general PERMV.
/// NOTE(review): embedded line numbers skip values; 'return' lines between
/// the lowering attempts appear to be elided -- verify against upstream.
13927 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13928 const APInt &Zeroable,
13929 SDValue V1, SDValue V2,
13930 const X86Subtarget &Subtarget,
13931 SelectionDAG &DAG) {
13932 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13933 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13934 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13936 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13937 // options to efficiently lower the shuffle.
13938 SmallVector<int, 4> RepeatedMask;
13939 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13940 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13942 // Use even/odd duplicate instructions for masks that match their pattern.
13943 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13944 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13945 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13946 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
// NOTE(review): a guard (likely 'if (V2.isUndef())') before this VPERMILPI
// return appears to be elided.
13949 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13950 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13952 // Use dedicated unpack instructions for masks that match their pattern.
13953 if (SDValue Unpck =
13954 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13957 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13958 Zeroable, Subtarget, DAG))
13961 // Otherwise, fall back to a SHUFPS sequence.
13962 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13965 // If we have a single input shuffle with different shuffle patterns in the
13966 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
13967 if (V2.isUndef() &&
13968 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
13969 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
13970 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
13973 // If we have AVX512F support, we can use VEXPAND.
13974 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13975 V1, V2, DAG, Subtarget))
13978 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13981 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
///
/// Single-input lane-repeated masks become PSHUFD (via a v16i32 bitcast) or
/// VPERMI; otherwise tries 128-bit subvector shuffles, shifts, VALIGN,
/// PALIGNR, UNPCK, VEXPAND, blend, and the general PERMV.
/// NOTE(review): embedded line numbers skip values; 'return' statements and
/// closing braces between attempts appear to be elided.
13982 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13983 const APInt &Zeroable,
13984 SDValue V1, SDValue V2,
13985 const X86Subtarget &Subtarget,
13986 SelectionDAG &DAG) {
13987 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13988 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13989 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13991 if (V2.isUndef()) {
13992 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13993 // can use lower latency instructions that will operate on all four
13995 SmallVector<int, 2> Repeated128Mask;
13996 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
// Widen the 2-element repeated mask to a 4-element i32 mask for PSHUFD.
13997 SmallVector<int, 4> PSHUFDMask;
13998 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
13999 return DAG.getBitcast(
14001 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
14002 DAG.getBitcast(MVT::v16i32, V1),
14003 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14006 SmallVector<int, 4> Repeated256Mask;
14007 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
14008 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
14009 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
14012 if (SDValue Shuf128 =
14013 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
14016 // Try to use shift instructions.
14017 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
14018 Zeroable, Subtarget, DAG))
14021 // Try to use VALIGN.
14022 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
14023 Mask, Subtarget, DAG))
14026 // Try to use PALIGNR.
14027 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
14028 Mask, Subtarget, DAG))
14031 if (SDValue Unpck =
14032 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
14034 // If we have AVX512F support, we can use VEXPAND.
14035 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
14036 V2, DAG, Subtarget))
14039 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
14040 Zeroable, Subtarget, DAG))
14043 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
14046 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
///
/// Tries zero-extension first, then lane-repeated PSHUFD, UNPCK, shifts,
/// VALIGN, byte rotation (BWI only), a SHUFPS domain-crossing fallback,
/// VEXPAND, blend, and finally the general PERMV.
/// NOTE(review): embedded line numbers skip values; elided lines likely held
/// the 'return' statements between lowering attempts.
14047 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14048 const APInt &Zeroable,
14049 SDValue V1, SDValue V2,
14050 const X86Subtarget &Subtarget,
14051 SelectionDAG &DAG) {
14052 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14053 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14054 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14056 // Whenever we can lower this as a zext, that instruction is strictly faster
14057 // than any alternative. It also allows us to fold memory operands into the
14058 // shuffle in many cases.
14059 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14060 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14063 // If the shuffle mask is repeated in each 128-bit lane we can use more
14064 // efficient instructions that mirror the shuffles across the four 128-bit
14066 SmallVector<int, 4> RepeatedMask;
14067 bool Is128BitLaneRepeatedShuffle =
14068 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
14069 if (Is128BitLaneRepeatedShuffle) {
14070 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
// NOTE(review): a guard (likely 'if (V2.isUndef())') before this PSHUFD
// return appears to be elided.
14072 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
14073 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14075 // Use dedicated unpack instructions for masks that match their pattern.
14077 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
14081 // Try to use shift instructions.
14082 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
14083 Zeroable, Subtarget, DAG))
14086 // Try to use VALIGN.
14087 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
14088 Mask, Subtarget, DAG))
14091 // Try to use byte rotation instructions.
14092 if (Subtarget.hasBWI())
14093 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14094 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
14097 // Assume that a single SHUFPS is faster than using a permv shuffle.
14098 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14099 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
// Bitcast into the FP domain to reuse the SHUFPS lowering, then cast back.
14100 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
14101 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
14102 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
14103 CastV1, CastV2, DAG);
14104 return DAG.getBitcast(MVT::v16i32, ShufPS);
14106 // If we have AVX512F support, we can use VEXPAND.
14107 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
14108 V1, V2, DAG, Subtarget))
14111 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
14112 Zeroable, Subtarget, DAG))
14114 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
14117 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
///
/// Requires AVX-512 BWI. Tries zero-extension, UNPCK, shifts, byte rotation,
/// a v8i16-style single-input lowering for lane-repeated masks, blend,
/// PSHUFB, and finally the general PERMV.
/// NOTE(review): embedded line numbers skip values; 'return' lines between
/// lowering attempts appear to be elided.
14118 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14119 const APInt &Zeroable,
14120 SDValue V1, SDValue V2,
14121 const X86Subtarget &Subtarget,
14122 SelectionDAG &DAG) {
14123 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14124 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14125 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14126 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
14128 // Whenever we can lower this as a zext, that instruction is strictly faster
14129 // than any alternative. It also allows us to fold memory operands into the
14130 // shuffle in many cases.
14131 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14132 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14135 // Use dedicated unpack instructions for masks that match their pattern.
14137 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
14140 // Try to use shift instructions.
14141 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
14142 Zeroable, Subtarget, DAG))
14145 // Try to use byte rotation instructions.
14146 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14147 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
14150 if (V2.isUndef()) {
14151 SmallVector<int, 8> RepeatedMask;
14152 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
14153 // As this is a single-input shuffle, the repeated mask should be
14154 // a strictly valid v8i16 mask that we can pass through to the v8i16
14155 // lowering to handle even the v32 case.
14156 return lowerV8I16GeneralSingleInputVectorShuffle(
14157 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
14161 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
14162 Zeroable, Subtarget, DAG))
14165 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14166 DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14169 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
14172 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
///
/// Requires AVX-512 BWI. Tries zero-extension, UNPCK, shifts, byte rotation,
/// PSHUFB, PERMV (VBMI only), repeated-mask lane permute, and blend; splits
/// into two 256-bit shuffles as a last resort.
/// NOTE(review): embedded line numbers skip values; 'return' lines between
/// lowering attempts appear to be elided.
14173 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14174 const APInt &Zeroable,
14175 SDValue V1, SDValue V2,
14176 const X86Subtarget &Subtarget,
14177 SelectionDAG &DAG) {
14178 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14179 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14180 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14181 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14183 // Whenever we can lower this as a zext, that instruction is strictly faster
14184 // than any alternative. It also allows us to fold memory operands into the
14185 // shuffle in many cases.
14186 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14187 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14190 // Use dedicated unpack instructions for masks that match their pattern.
14192 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
14195 // Try to use shift instructions.
14196 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
14197 Zeroable, Subtarget, DAG))
14200 // Try to use byte rotation instructions.
14201 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14202 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14205 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14206 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14209 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14210 if (Subtarget.hasVBMI())
14211 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
14213 // Try to create an in-lane repeating shuffle mask and then shuffle the
14214 // the results into the target lanes.
14215 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14216 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14219 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
14220 Zeroable, Subtarget, DAG))
14223 // FIXME: Implement direct support for this type!
14224 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
14227 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
14229 /// This routine either breaks down the specific type of a 512-bit x86 vector
14230 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
14231 /// together based on the available instructions.
///
/// NOTE(review): embedded line numbers skip values; the 'case MVT::...'
/// labels of the dispatch switch below appear to be elided.
14232 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14233 MVT VT, SDValue V1, SDValue V2,
14234 const APInt &Zeroable,
14235 const X86Subtarget &Subtarget,
14236 SelectionDAG &DAG) {
14237 assert(Subtarget.hasAVX512() &&
14238 "Cannot lower 512-bit vectors w/ basic ISA!");
14240 // If we have a single input to the zero element, insert that into V1 if we
14241 // can do so cheaply.
14242 int NumElts = Mask.size();
14243 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14245 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14246 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14247 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14250 // Handle special cases where the lower or upper half is UNDEF.
14252 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14255 // Check for being able to broadcast a single element.
14256 if (SDValue Broadcast =
14257 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
14260 // Dispatch to each element type for lowering. If we don't have support for
14261 // specific element type shuffles at 512 bits, immediately split them and
14262 // lower them. Each lowering routine of a given type is allowed to assume that
14263 // the requisite ISA extensions for that element type are available.
14264 switch (VT.SimpleTy) {
14266 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14268 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14270 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14272 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14274 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14276 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14279 llvm_unreachable("Not a valid 512-bit x86 vector type!");
14283 // Lower vXi1 vector shuffles.
14284 // There is no a dedicated instruction on AVX-512 that shuffles the masks.
14285 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
14286 // vector, shuffle and then truncate it back.
// NOTE(review): embedded line numbers skip values; the switch's 'case' labels
// and the declaration of ExtVT appear to be elided from this listing.
14287 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14288 MVT VT, SDValue V1, SDValue V2,
14289 const X86Subtarget &Subtarget,
14290 SelectionDAG &DAG) {
14291 assert(Subtarget.hasAVX512() &&
14292 "Cannot lower 512-bit vectors w/o basic ISA!");
14294 switch (VT.SimpleTy) {
14296 llvm_unreachable("Expected a vector of i1 elements");
14298 ExtVT = MVT::v2i64;
14301 ExtVT = MVT::v4i32;
14304 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
14306 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
14309 ExtVT = MVT::v16i32;
14312 ExtVT = MVT::v32i16;
14315 ExtVT = MVT::v64i8;
// Materialize all-zeros/all-ones inputs directly in the extended type;
// otherwise sign-extend so each i1 becomes an all-zeros/all-ones element.
14319 if (ISD::isBuildVectorAllZeros(V1.getNode()))
14320 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14321 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
14322 V1 = getOnesVector(ExtVT, DAG, DL);
14324 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
// NOTE(review): the 'if (V2.isUndef())' guard for the next line appears to
// be elided.
14327 V2 = DAG.getUNDEF(ExtVT);
14328 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
14329 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14330 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
14331 V2 = getOnesVector(ExtVT, DAG, DL);
14333 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14335 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
14336 // i1 was sign extended we can use X86ISD::CVT2MASK.
14337 int NumElems = VT.getVectorNumElements();
14338 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14339 (Subtarget.hasDQI() && (NumElems < 32)))
14340 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
14342 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
14345 /// Helper function that returns true if the shuffle mask should be
14346 /// commuted to improve canonicalization.
///
/// Tie-breaking order: (1) more elements from V2 than V1; (2) more V2
/// elements in the low half; (3) larger index sum for V1 than V2; (4) more
/// odd-position V1 indices than V2.
/// NOTE(review): embedded line numbers skip values; the counting loop bodies
/// and several 'return true/false;' lines appear to be elided.
14347 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14348 int NumElements = Mask.size();
14350 int NumV1Elements = 0, NumV2Elements = 0;
14354 else if (M < NumElements)
14359 // Commute the shuffle as needed such that more elements come from V1 than
14360 // V2. This allows us to match the shuffle pattern strictly on how many
14361 // elements come from V1 without handling the symmetric cases.
14362 if (NumV2Elements > NumV1Elements)
14365 assert(NumV1Elements > 0 && "No V1 indices");
14367 if (NumV2Elements == 0)
14370 // When the number of V1 and V2 elements are the same, try to minimize the
14371 // number of uses of V2 in the low half of the vector. When that is tied,
14372 // ensure that the sum of indices for V1 is equal to or lower than the sum
14373 // indices for V2. When those are equal, try to ensure that the number of odd
14374 // indices for V1 is lower than the number of odd indices for V2.
14375 if (NumV1Elements == NumV2Elements) {
14376 int LowV1Elements = 0, LowV2Elements = 0;
14377 for (int M : Mask.slice(0, NumElements / 2))
14378 if (M >= NumElements)
14382 if (LowV2Elements > LowV1Elements)
14384 if (LowV2Elements == LowV1Elements) {
14385 int SumV1Indices = 0, SumV2Indices = 0;
14386 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14387 if (Mask[i] >= NumElements)
14389 else if (Mask[i] >= 0)
14391 if (SumV2Indices < SumV1Indices)
14393 if (SumV2Indices == SumV1Indices) {
14394 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14395 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14396 if (Mask[i] >= NumElements)
14397 NumV2OddIndices += i % 2;
14398 else if (Mask[i] >= 0)
14399 NumV1OddIndices += i % 2;
14400 if (NumV2OddIndices < NumV1OddIndices)
14409 /// \brief Top-level lowering for x86 vector shuffles.
14411 /// This handles decomposition, canonicalization, and lowering of all x86
14412 /// vector shuffles. Most of the specific lowering strategies are encapsulated
14413 /// above in helper routines. The canonicalization attempts to widen shuffles
14414 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
14415 /// s.t. only one of the two inputs needs to be tested, etc.
///
/// NOTE(review): embedded line numbers skip values; the declaration of DL
/// (used below, presumably 'SDLoc DL(Op);') and several guard/return lines
/// appear to be elided from this listing.
14416 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
14417 SelectionDAG &DAG) {
14418 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
14419 ArrayRef<int> Mask = SVOp->getMask();
14420 SDValue V1 = Op.getOperand(0);
14421 SDValue V2 = Op.getOperand(1);
14422 MVT VT = Op.getSimpleValueType();
14423 int NumElements = VT.getVectorNumElements();
14425 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
14427 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
14428 "Can't lower MMX shuffles");
14430 bool V1IsUndef = V1.isUndef();
14431 bool V2IsUndef = V2.isUndef();
14432 if (V1IsUndef && V2IsUndef)
14433 return DAG.getUNDEF(VT);
14435 // When we create a shuffle node we put the UNDEF node to second operand,
14436 // but in some cases the first operand may be transformed to UNDEF.
14437 // In this case we should just commute the node.
// NOTE(review): the 'if (V1IsUndef)' guard for this commute appears to be
// elided.
14439 return DAG.getCommutedVectorShuffle(*SVOp);
14441 // Check for non-undef masks pointing at an undef vector and make the masks
14442 // undef as well. This makes it easier to match the shuffle based solely on
14446 if (M >= NumElements) {
14447 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
14448 for (int &M : NewMask)
14449 if (M >= NumElements)
// Redirect V2 references to undef (-1); the surrounding loop/guard lines
// are elided in this listing.
14451 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14454 // Check for illegal shuffle mask element index values.
14455 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
14456 assert(llvm::all_of(Mask,
14457 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
14458 "Out of bounds shuffle index");
14460 // We actually see shuffles that are entirely re-arrangements of a set of
14461 // zero inputs. This mostly happens while decomposing complex shuffles into
14462 // simple ones. Directly lower these as a buildvector of zeros.
14463 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
14464 if (Zeroable.isAllOnesValue())
14465 return getZeroVector(VT, Subtarget, DAG, DL);
14467 // Try to collapse shuffles into using a vector type with fewer elements but
14468 // wider element types. We cap this to not form integers or floating point
14469 // elements wider than 64 bits, but it might be interesting to form i128
14470 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
14471 SmallVector<int, 16> WidenedMask;
14472 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
14473 canWidenShuffleElements(Mask, WidenedMask)) {
14474 MVT NewEltVT = VT.isFloatingPoint()
14475 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
14476 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
14477 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14478 // Make sure that the new vector type is legal. For example, v2f64 isn't
14480 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14481 V1 = DAG.getBitcast(NewVT, V1);
14482 V2 = DAG.getBitcast(NewVT, V2);
14483 return DAG.getBitcast(
14484 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
14488 // Commute the shuffle if it will improve canonicalization.
14489 if (canonicalizeShuffleMaskWithCommute(Mask))
14490 return DAG.getCommutedVectorShuffle(*SVOp);
14492 // For each vector width, delegate to a specialized lowering routine.
14493 if (VT.is128BitVector())
14494 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14497 if (VT.is256BitVector())
14498 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14501 if (VT.is512BitVector())
14502 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14506 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
14508 llvm_unreachable("Unimplemented!");
14511 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
///
/// Only applies when the condition is a constant build-vector: each lane
/// picks LHS (index i) for a non-zero condition or RHS (index i + Size) for
/// zero, then defers to the shuffle lowering for blends.
/// NOTE(review): embedded line numbers skip values; a 'return SDValue();'
/// after the constant-condition check and the 'dl' declaration appear to be
/// elided.
14512 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14513 const X86Subtarget &Subtarget,
14514 SelectionDAG &DAG) {
14515 SDValue Cond = Op.getOperand(0);
14516 SDValue LHS = Op.getOperand(1);
14517 SDValue RHS = Op.getOperand(2);
14519 MVT VT = Op.getSimpleValueType();
14521 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14523 auto *CondBV = cast<BuildVectorSDNode>(Cond);
14525 // Only non-legal VSELECTs reach this lowering, convert those into generic
14526 // shuffles and re-use the shuffle lowering path for blends.
14527 SmallVector<int, 32> Mask;
14528 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14529 SDValue CondElt = CondBV->getOperand(i);
// Non-constant (undef) condition elements map to an undef mask entry; the
// surrounding Mask.push_back(...) line is partially elided here.
14531 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
14534 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
// Lower ISD::VSELECT: constant conditions become shuffles, i1-mask and
// 512-bit cases use AVX-512 mask selects, and remaining cases either stay
// legal blends or are expanded per-type.
// NOTE(review): embedded line numbers skip values; 'return' statements, the
// 'dl' declaration, and the switch's 'case' labels appear to be elided.
14537 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14538 // A vselect where all conditions and data are constants can be optimized into
14539 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14540 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14541 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14542 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14545 // Try to lower this to a blend-style vector shuffle. This can handle all
14546 // constant condition cases.
14547 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14550 // If this VSELECT has a vector if i1 as a mask, it will be directly matched
14551 // with patterns on the mask registers on AVX-512.
14552 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14555 // Variable blends are only legal from SSE4.1 onward.
14556 if (!Subtarget.hasSSE41())
14560 MVT VT = Op.getSimpleValueType();
14562 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14563 // into an i1 condition so that we can use the mask-based 512-bit blend
14565 if (VT.getSizeInBits() == 512) {
14566 SDValue Cond = Op.getOperand(0);
14567 // The vNi1 condition case should be handled above as it can be trivially
14569 assert(Cond.getValueType().getScalarSizeInBits() ==
14570 VT.getScalarSizeInBits() &&
14571 "Should have a size-matched integer condition!");
14572 // Build a mask by testing the condition against itself (tests for zero).
14573 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14574 SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14575 // Now return a new VSELECT using the mask.
14576 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14579 // Only some types will be legal on some subtargets. If we can emit a legal
14580 // VSELECT-matching blend, return Op, and but if we need to expand, return
14582 switch (VT.SimpleTy) {
14584 // Most of the vector types have blends past SSE4.1.
14588 // The byte blends for AVX vectors were introduced only in AVX2.
14589 if (Subtarget.hasAVX2())
14596 // FIXME: We should custom lower this by fixing the condition and using i8
// Lower EXTRACT_VECTOR_ELT from a 128-bit vector using SSE4.1 extract
// instructions (PEXTRB for i8 results, EXTRACTPS-style for f32 when
// profitable, PEXTRD/PEXTRQ for i32/i64 with constant index).
// NOTE(review): embedded line numbers skip values; the 'dl' declaration and
// several 'return SDValue();' fall-through lines appear to be elided.
14602 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
14603 MVT VT = Op.getSimpleValueType();
14606 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14609 if (VT.getSizeInBits() == 8) {
// PEXTRB produces a 32-bit result; truncate back to the i8 value type.
14610 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14611 Op.getOperand(0), Op.getOperand(1));
14612 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
14615 if (VT == MVT::f32) {
14616 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14617 // the result back to FR32 register. It's only worth matching if the
14618 // result has a single use which is a store or a bitcast to i32. And in
14619 // the case of a store, it's not worth it if the index is a constant 0,
14620 // because a MOVSSmr can be used instead, which is smaller and faster.
14621 if (!Op.hasOneUse())
14623 SDNode *User = *Op.getNode()->use_begin();
14624 if ((User->getOpcode() != ISD::STORE ||
14625 isNullConstant(Op.getOperand(1))) &&
14626 (User->getOpcode() != ISD::BITCAST ||
14627 User->getValueType(0) != MVT::i32))
// Extract via the integer domain and bitcast the i32 lane back to f32.
14629 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14630 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14632 return DAG.getBitcast(MVT::f32, Extract);
14635 if (VT == MVT::i32 || VT == MVT::i64) {
14636 // ExtractPS/pextrq works with constant index.
14637 if (isa<ConstantSDNode>(Op.getOperand(1)))
14644 /// Extract one bit from mask vector, like v16i1 or v8i1.
14645 /// AVX-512 feature.
///
/// Variable indices are handled by sign-extending to a SIMD vector and
/// extracting there; constant indices use a KSHIFTR to move the bit to
/// element 0, promoting narrow mask types to v16i1 when the native-width
/// kshift is unavailable.
/// NOTE(review): embedded line numbers skip values; the 'dl' declaration and
/// some guard/return lines appear to be elided.
14646 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
14647 const X86Subtarget &Subtarget) {
14648 SDValue Vec = Op.getOperand(0);
14650 MVT VecVT = Vec.getSimpleValueType();
14651 SDValue Idx = Op.getOperand(1);
14652 MVT EltVT = Op.getSimpleValueType();
14654 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14655 "Unexpected vector type in ExtractBitFromMaskVector");
14657 // variable index can't be handled in mask registers,
14658 // extend vector to VR512/128
14659 if (!isa<ConstantSDNode>(Idx)) {
14660 unsigned NumElts = VecVT.getVectorNumElements();
14661 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
14662 // than extending to 128/256bit.
14663 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
14664 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
14665 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
14666 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
14667 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14670 // Canonicalize result type to MVT::i32.
14671 if (EltVT != MVT::i32) {
14672 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14674 return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
14677 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14679 // Extracts from element 0 are always allowed.
// NOTE(review): the early-return for IdxVal == 0 is elided here.
14683 // If the kshift instructions of the correct width aren't natively supported
14684 // then we need to promote the vector to the native size to get the correct
14685 // zeroing behavior.
14686 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14687 (VecVT.getVectorNumElements() < 8)) {
14688 VecVT = MVT::v16i1;
14689 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14690 DAG.getUNDEF(VecVT),
14692 DAG.getIntPtrConstant(0, dl));
14695 // Use kshiftr instruction to move to the lower element.
14696 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14697 DAG.getConstant(IdxVal, dl, MVT::i8));
14698 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
14699 DAG.getIntPtrConstant(0, dl));
/// Lower EXTRACT_VECTOR_ELT: vXi1 mask extracts go to a dedicated helper,
/// 256/512-bit sources are narrowed to a 128-bit chunk first, and the
/// remaining 128-bit cases select per-element-width idioms (PEXTRW, sub-byte
/// shift+truncate, SHUFPS/UNPCKHPD shuffles).
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);

  // AVX-512 mask registers need bit-level extraction.
  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

  if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // IACA tool was used to get performance estimation
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // example : extractelement <16 x i8> %a, i32 %i
    //
    // Block Throughput: 3.00 Cycles
    // Throughput Bottleneck: Port5
    //
    // | Num Of |   Ports pressure in cycles  |    |
    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
    // ---------------------------------------------
    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
    // |   2    |    1.0    | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
    // Total Num Of Uops: 4
    //
    // Block Throughput: 1.00 Cycles
    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
    //
    // |    |  Ports pressure in cycles       |    |
    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |   |
    // ---------------------------------------------------------
    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
    IdxVal &= ElemsPerChunk - 1;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  MVT VT = Op.getSimpleValueType();

  if (VT.getSizeInBits() == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    // Transform it so it matches pextrw, which produces a 32-bit result.
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))

  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling to
  // stack.
  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
    // Extract either the lowest i32 or any i16, and extract the sub-byte.
    int DWordIdx = IdxVal / 4;
    if (DWordIdx == 0) {
      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                DAG.getBitcast(MVT::v4i32, Vec),
                                DAG.getIntPtrConstant(DWordIdx, dl));
      // Shift the wanted byte down to bit 0 before truncating.
      int ShiftVal = (IdxVal % 4) * 8;
      Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

    int WordIdx = IdxVal / 2;
    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                              DAG.getBitcast(MVT::v8i16, Vec),
                              DAG.getIntPtrConstant(WordIdx, dl));
    // Shift the wanted byte of the i16 down to bit 0 before truncating.
    int ShiftVal = (IdxVal % 2) * 8;
    Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                      DAG.getConstant(ShiftVal, dl, MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

  if (VT.getSizeInBits() == 32) {
    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
/// Insert one bit to mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
/// Strategy: for a non-constant index, widen to an integer vector, insert,
/// and truncate back; otherwise use kshift sequences to splice the new bit
/// into the mask register without leaving it.
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDValue Vec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  MVT VecVT = Vec.getSimpleValueType();

  if (!isa<ConstantSDNode>(Idx)) {
    // Non constant index. Extend source and destination,
    // insert element and then truncate the result.
    unsigned NumElts = VecVT.getVectorNumElements();
    // <=8 elements widen to a 128-bit vector; larger masks use i8 elements.
    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
                                DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
                                DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  unsigned NumElems = VecVT.getVectorNumElements();

  // If the kshift instructions of the correct width aren't natively supported
  // then we need to promote the vector to the native size to get the correct
  // zeroing behavior.
  if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
    // Need to promote to v16i1, do the insert, then extract back.
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
                      DAG.getUNDEF(MVT::v16i1), Vec,
                      DAG.getIntPtrConstant(0, dl));
    Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
                       DAG.getIntPtrConstant(0, dl));

  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);

  if (Vec.isUndef()) {
    // No other lanes to preserve: just shift the new bit into position.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(IdxVal, dl, MVT::i8));

  // Insertion of one bit into first position
  if (IdxVal == 0 ) {
    // Clean top bits of vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    // Clean the first bit in source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1 , dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);

  // Insertion of one bit into last position
  if (IdxVal == NumElems - 1) {
    // Move the bit to the last position inside the vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(IdxVal, dl, MVT::i8));
    // Clean the last bit in the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1 , dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);

  // General case: use an xor/shift/xor dance so only bit IdxVal changes.
  // Move the current value of the bit to be replaced to bit 0.
  SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                               DAG.getConstant(IdxVal, dl, MVT::i8));
  // Xor with the new bit.
  Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
  // Shift to MSB, filling bottom bits with 0.
  Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
                       DAG.getConstant(NumElems - 1, dl, MVT::i8));
  // Shift to the final position, filling upper bits with 0.
  Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
                       DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
  // Xor with original vector to cancel out the original bit value that's still
  // present at position IdxVal.
  return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
/// Lower INSERT_VECTOR_ELT: vXi1 masks go to InsertBitToMaskVector;
/// zero/all-ones scalars become blend shuffles; wide vectors are narrowed to
/// a 128-bit chunk; otherwise PINSRB/PINSRW/INSERTPS/BLENDI idioms are used.
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();

  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG, Subtarget);

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);
  // Only constant insertion indices are custom-lowered here.
  if (!isa<ConstantSDNode>(N2))
  auto *N2C = cast<ConstantSDNode>(N2);
  unsigned IdxVal = N2C->getZExtValue();

  bool IsZeroElt = X86::isZeroNode(N1);
  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

  // If we are inserting an element, see if we can do this more efficiently with
  // a blend shuffle with a rematerializable vector than a costly integer
  // insertion.
  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
      16 <= EltVT.getSizeInBits()) {
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                  : getOnesVector(VT, DAG, dl);
    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getConstant(IdxIn128, dl, MVT::i32));

    // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);

  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
  // argument. SSE41 required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;

    // Widen sub-i32 scalars; pinsr{b,w} consume a GR32.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      // zero here. The DAG Combiner may combine an extract_elt index into
      // these bits. For example (insert (extract, 3), 2) could be matched by
      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      // value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      // combine either bitwise AND or insert of float 0.0 to set these bits.

      bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand.
        N2 = DAG.getIntPtrConstant(1, dl);
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar to vector insertion.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);

    // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
/// Lower SCALAR_TO_VECTOR: fold zero scalars to a zero vector, build wide
/// results via a 128-bit insert, and funnel sub-i32 integers through v4i32.
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT OpVT = Op.getSimpleValueType();

  // It's always cheaper to replace a xor+movd with xorps and simplifies further
  // analysis.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);

  assert(OpVT.is128BitVector() && "Expected an SSE type!");

  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)

  // Otherwise widen the scalar to i32 and bitcast the v4i32 result back.
  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
15099 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15100 // simple superregister reference or explicit instructions to insert
15101 // the upper bits of a vector.
15102 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15103 SelectionDAG &DAG) {
15104 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
15106 return insert1BitVector(Op, DAG, Subtarget);
/// Lower EXTRACT_SUBVECTOR for vXi1 masks: shift the wanted bits down to the
/// LSB with a kshift (widening first if the mask width has no native kshift)
/// and then take the subvector from offset zero, which is legal.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 extract_subvectors need custom lowering");

  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);

  // Variable offsets are not handled here.
  if (!isa<ConstantSDNode>(Idx))

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0) // the operation is legal

  MVT VecVT = Vec.getSimpleValueType();
  unsigned NumElems = VecVT.getVectorNumElements();

  // Extend to natively supported kshift.
  MVT WideVecVT = VecVT;
  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
                      DAG.getUNDEF(WideVecVT), Vec,
                      DAG.getIntPtrConstant(0, dl));

  // Shift to the LSB.
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
                    DAG.getConstant(IdxVal, dl, MVT::i8));

  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
15145 // Returns the appropriate wrapper opcode for a global reference.
15146 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
15147 // References to absolute symbols are never PC-relative.
15148 if (GV && GV->isAbsoluteSymbolRef())
15149 return X86ISD::Wrapper;
15151 CodeModel::Model M = getTargetMachine().getCodeModel();
15152 if (Subtarget.isPICStyleRIPRel() &&
15153 (M == CodeModel::Small || M == CodeModel::Kernel))
15154 return X86ISD::WrapperRIP;
15156 return X86ISD::Wrapper;
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetConstantPool(
      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  DAG.getNode(ISD::ADD, DL, PtrVT,
              DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
/// Lower a jump-table reference: wrap the target jump table node and, in
/// 32-bit PIC mode, add the global base register to form the final address.
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);

  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  DAG.getNode(ISD::ADD, DL, PtrVT,
              DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
/// Lower an external-symbol reference: wrap the target symbol node, add the
/// PIC base in 32-bit PIC mode, and load through the GOT for stub references.
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isPositionIndependent() && !Subtarget.is64Bit()) {
    DAG.getNode(ISD::ADD, DL, PtrVT,
                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
/// Lower a block-address reference: wrap the target node and add the PIC base
/// when the reference is relative to it.
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddressAddress node.
  unsigned char OpFlags =
    Subtarget.classifyBlockAddressReference();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
/// Lower a global address for \p GV with the given constant \p Offset.
/// Folds the offset into the TargetGlobalAddress when the code model allows,
/// wraps the node, applies PIC-base/GOT-stub fixups, and finally adds any
/// unfolded offset explicitly.
SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
                                              const SDLoc &dl, int64_t Offset,
                                              SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
  CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);

  Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                       DAG.getConstant(Offset, dl, PtrVT));
15303 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15304 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15305 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15306 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
/// Emit the TLSADDR/TLSBASEADDR pseudo (which is codegen'ed as a call to the
/// TLS runtime) and return the resulting address from \p ReturnReg.
/// \p InFlag, when non-null, glues the call to a preceding PIC-base setup.
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
  // Local-dynamic accesses share a single base-address computation.
  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
// ELF: the PIC base is materialized in EBX, then the TLSADDR pseudo calls
// ___tls_get_addr and the variable's address comes back in EAX.
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
  SDLoc dl(GA);  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15354 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
15356 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15358 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15359 X86::RAX, X86II::MO_TLSGD);
/// Lower ISD::GlobalTLSAddress using the "local dynamic" model: compute the
/// module's TLS base once (TLSLD/TLSLDM) and add the variable's x@dtpoff.
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
                                    .getInfo<X86MachineFunctionInfo>();
  // Counted so CleanupLocalDynamicTLSPass can decide whether to CSE bases.
  MFI->incNumLocalDynamicTLSAccesses();

  Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                    X86II::MO_TLSLD, /*LocalDynamic=*/true);

  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
      DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);
  Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                    X86II::MO_TLSLDM, /*LocalDynamic=*/true);

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of the base address.

  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initial-exec on 64-bit, which reads the offset RIP-relatively via the GOT.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    OperandFlags = X86II::MO_GOTTPOFF;
    WrapperKind = X86ISD::WrapperRIP;
    OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    llvm_unreachable("Unexpected model");

  // emit "addl x@ntpoff,%eax" (local exec)
  // or   "addl x@indntpoff,%eax" (initial exec)
  // or   "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                             GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
    // The variable's offset must itself be loaded from the GOT.
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
/// Top-level TLS lowering dispatch: emulated TLS, ELF (per TLS model),
/// Darwin (TLVP descriptor call), or Windows implicit TLS.
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().Options.EmulatedTLS)
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    case TLSModel::GeneralDynamic:
      if (Subtarget.is64Bit())
        return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
    case TLSModel::LocalDynamic:
      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
                                         Subtarget.is64Bit());
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                 PositionIndependent);
    llvm_unreachable("Unknown TLS model.");

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
      OpFlag = X86II::MO_TLVP_PIC_BASE;
      OpFlag = X86II::MO_TLVP;
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),

    // Lowering the machine isd will make sure everything is in the right
    // place.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                               DAG.getIntPtrConstant(0, DL, true),
                               Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium() ||
      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //   mov ecx, dword [rel _tls_index]: Load index (from C runtime)
    //   mov rcx, qword [rdx+rcx*8]
    //   mov eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                        : Type::getInt32PtrTy(*DAG.getContext(),

    SDValue TlsArray = Subtarget.is64Bit()
                       ? DAG.getIntPtrConstant(0x58, dl)
                       : (Subtarget.isTargetWindowsGNU()
                          ? DAG.getIntPtrConstant(0x2C, dl)
                          : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    // Local-exec variables live in this module's own TLS block; otherwise the
    // block must be located through _tls_index.
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
      // Load the _tls_index variable
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      // Scale the index by the pointer size to index the TLS-block array.
      auto &DL = DAG.getDataLayout();
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of start of .tls section
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);

  llvm_unreachable("TLS not implemented for this target.");
15603 /// Lower SRA_PARTS and friends, which return two i32 values
15604 /// and take a 2 x i32 value to shift plus a shift amount.
// Expands a double-wide shift (SHL_PARTS / SRL_PARTS / SRA_PARTS) into
// X86 SHLD/SHRD nodes plus a CMOV-based fixup that handles shift amounts
// greater than or equal to the width of one part.
// NOTE(review): this view elides some lines (e.g. the SDLoc declaration and
// the else-branches); the surviving lines are documented as-is.
15605 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15606 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15607 MVT VT = Op.getSimpleValueType();
15608 unsigned VTBits = VT.getSizeInBits();
15610 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15611 SDValue ShOpLo = Op.getOperand(0);
15612 SDValue ShOpHi = Op.getOperand(1);
15613 SDValue ShAmt = Op.getOperand(2);
15614 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15615 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
// Mask the amount to VTBits-1 so the plain SHL/SRL/SRA below never sees an
// out-of-range count.
15617 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15618 DAG.getConstant(VTBits - 1, dl, MVT::i8));
// Tmp1 is the value that fills the "emptied" part for large shifts:
// sign-replicated high part for SRA, zero otherwise.
15619 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15620 DAG.getConstant(VTBits - 1, dl, MVT::i8))
15621 : DAG.getConstant(0, dl, VT);
15623 SDValue Tmp2, Tmp3;
15624 if (Op.getOpcode() == ISD::SHL_PARTS) {
15625 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15626 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
// (else: right-shift flavors use SHRD for the low part instead.)
15628 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15629 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15632 // If the shift amount is larger or equal than the width of a part we can't
15633 // rely on the results of shld/shrd. Insert a test and select the appropriate
15634 // values for large shift amounts.
// VTBits is a power of two, so ANDing the amount with VTBits isolates the
// single bit that says "amount >= VTBits".
15635 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15636 DAG.getConstant(VTBits, dl, MVT::i8));
15637 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15638 AndNode, DAG.getConstant(0, dl, MVT::i8));
// COND_NE picks the second operand of each CMOV when the large-shift bit
// is set; Ops0/Ops1 pair the small-shift and large-shift candidates.
15641 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15642 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15643 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15645 if (Op.getOpcode() == ISD::SHL_PARTS) {
15646 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15647 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
// (else: for right shifts the roles of Lo and Hi swap.)
15649 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15650 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
// Return both halves as a merged two-result node, matching the *_PARTS
// contract of two result values.
15653 SDValue Ops[2] = { Lo, Hi };
15654 return DAG.getMergeValues(Ops, dl);
// Lowers ISD::SINT_TO_FP. Vector cases are handled by widening v2i32/v2i1
// sources and emitting X86ISD::CVTSI2P; scalar cases either stay legal
// (SSE cvtsi2ss/sd) or are spilled to a stack slot and converted with x87
// FILD via BuildFILD.
15657 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15658 SelectionDAG &DAG) const {
15659 SDValue Src = Op.getOperand(0);
15660 MVT SrcVT = Src.getSimpleValueType();
15661 MVT VT = Op.getSimpleValueType();
15664 if (SrcVT.isVector()) {
15665 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
// Widen v2i32 to v4i32 (upper half undef) so CVTSI2P has a legal input.
15666 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15667 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
15668 DAG.getUNDEF(SrcVT)));
15670 if (SrcVT == MVT::v2i1) {
15671 // For v2i1, we need to widen to v4i1 first.
15672 assert(VT == MVT::v2f64 && "Unexpected type");
15673 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src,
15674 DAG.getUNDEF(MVT::v2i1));
// Sign-extend the mask to v4i32 so the signed conversion sees 0/-1 lanes.
15675 return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(),
15676 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src));
// Scalar path: only i16..i64 sources reach here.
15681 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15682 "Unknown SINT_TO_FP to lower!");
15684 // These are really Legal; return the operand so the caller accepts it as
15686 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
15688 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15689 Subtarget.is64Bit()) {
// Otherwise spill the integer to a stack slot and convert with x87 FILD.
15693 SDValue ValueToStore = Op.getOperand(0);
15694 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15695 !Subtarget.is64Bit())
15696 // Bitcasting to f64 here allows us to do a single 64-bit store from
15697 // an SSE register, avoiding the store forwarding penalty that would come
15698 // with two 32-bit stores.
15699 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
// Create a stack object sized/aligned to the source integer and store it.
15701 unsigned Size = SrcVT.getSizeInBits()/8;
15702 MachineFunction &MF = DAG.getMachineFunction();
15703 auto PtrVT = getPointerTy(MF.getDataLayout());
15704 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15705 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15706 SDValue Chain = DAG.getStore(
15707 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15708 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15709 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
// Builds an x87 FILD (integer load + convert) reading SrcVT from StackSlot.
// If the result FP type lives in an SSE register, the value must additionally
// round-trip through memory: FILD_FLAG -> FST to a new stack slot -> SSE load,
// because x87 and SSE register files are disjoint.
15712 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15714 SelectionDAG &DAG) const {
15718 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
// useSSE needs a Glue result to chain the later FST to this FILD.
15720 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15722 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15724 unsigned ByteSize = SrcVT.getSizeInBits()/8;
// StackSlot is either a frame index (build a fresh fixed-stack MMO) or an
// existing load (reuse its memory operand and strip down to the address).
15726 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15727 MachineMemOperand *MMO;
15729 int SSFI = FI->getIndex();
15730 MMO = DAG.getMachineFunction().getMachineMemOperand(
15731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15732 MachineMemOperand::MOLoad, ByteSize, ByteSize);
15734 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15735 StackSlot = StackSlot.getOperand(1);
15737 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15738 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15740 Tys, Ops, SrcVT, MMO);
// SSE destination: spill the x87 result and reload it into an SSE register.
15743 Chain = Result.getValue(1);
15744 SDValue InFlag = Result.getValue(2);
15746 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15747 // shouldn't be necessary except that RFP cannot be live across
15748 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15749 MachineFunction &MF = DAG.getMachineFunction();
15750 unsigned SSFISize = Op.getValueSizeInBits()/8;
15751 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15752 auto PtrVT = getPointerTy(MF.getDataLayout());
15753 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15754 Tys = DAG.getVTList(MVT::Other);
// The glue operand (InFlag) keeps the FST adjacent to the FILD_FLAG.
15756 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15758 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15759 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15760 MachineMemOperand::MOStore, SSFISize, SSFISize);
15762 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15763 Ops, Op.getValueType(), MMO);
// Final reload produces the value in the requested (SSE) FP type.
15764 Result = DAG.getLoad(
15765 Op.getValueType(), DL, Chain, StackSlot,
15766 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15772 /// 64-bit unsigned integer to double expansion.
// Splits the u64 into two halves, biases each half with a magic double
// exponent constant (2^52 for the low 32 bits, 2^84 for the high 32 bits),
// subtracts the biases, and sums the two partial doubles.
15773 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
15774 const X86Subtarget &Subtarget) {
15775 // This algorithm is not obvious. Here it is what we're trying to output:
15778 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15779 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15781 haddpd %xmm0, %xmm0
15783 pshufd $0x4e, %xmm0, %xmm1
15789 LLVMContext *Context = DAG.getContext();
15791 // Build some magic constants.
// 0x43300000 / 0x45300000 are the high words of the doubles 2^52 and 2^84;
// interleaving them above the u64 halves forms two biased doubles.
15792 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15793 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15794 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
15795 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15797 SmallVector<Constant*,2> CV1;
15799 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15800 APInt(64, 0x4330000000000000ULL))));
15802 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15803 APInt(64, 0x4530000000000000ULL))));
15804 Constant *C1 = ConstantVector::get(CV1);
15805 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15807 // Load the 64-bit value into an XMM register.
15808 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15811 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15812 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15813 /* Alignment = */ 16);
// Interleave the u64's two 32-bit halves with the magic high words
// (the punpckldq from the sketch above).
15815 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15818 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15819 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15820 /* Alignment = */ 16);
15821 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15822 // TODO: Are there any fast-math-flags to propagate here?
// Subtract the biases; the two lanes now hold lo and hi partial results.
15823 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15826 if (Subtarget.hasSSE3()) {
15827 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15828 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
// (else: shuffle the high lane down and add the two lanes manually.)
15830 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15831 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15832 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15833 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
// The summed double sits in lane 0; extract it as the scalar result.
15836 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15837 DAG.getIntPtrConstant(0, dl));
15840 /// 32-bit unsigned integer to float expansion.
// ORs the u32 into the mantissa of the double 2^52, subtracts 2^52 so the
// double equals the unsigned value exactly, then rounds to the result type.
15841 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
15842 const X86Subtarget &Subtarget) {
15844 // FP constant to bias correct the final result.
// 0x4330000000000000 is the double 2^52.
15845 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15848 // Load the 32-bit value into an XMM register.
15849 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15852 // Zero out the upper parts of the register.
15853 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
// Reinterpret the zero-extended u32 as the low half of an f64.
15855 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15856 DAG.getBitcast(MVT::v2f64, Load),
15857 DAG.getIntPtrConstant(0, dl));
15859 // Or the load with the bias.
// The OR places the integer bits into the bias's mantissa: the result is
// the double 2^52 + value.
15860 SDValue Or = DAG.getNode(
15861 ISD::OR, dl, MVT::v2i64,
15862 DAG.getBitcast(MVT::v2i64,
15863 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15864 DAG.getBitcast(MVT::v2i64,
15865 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15867 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15868 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15870 // Subtract the bias.
15871 // TODO: Are there any fast-math-flags to propagate here?
15872 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15874 // Handle final rounding.
// Extend or round the f64 to whatever scalar FP type the caller asked for.
15875 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
// v2i32 -> v2f64 unsigned conversion. With AVX512 this maps directly onto
// CVTUI2P; otherwise it splits each lane into 16-bit halves, converts each
// half signed (both halves fit in a signed i32), and recombines as
// fHI * 2^16 + fLO.
15878 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15879 const X86Subtarget &Subtarget, SDLoc &DL) {
15880 if (Op.getSimpleValueType() != MVT::v2f64)
15883 SDValue N0 = Op.getOperand(0);
15884 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15886 // Legalize to v4i32 type.
15887 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15888 DAG.getUNDEF(MVT::v2i32));
15890 if (Subtarget.hasAVX512())
15891 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15893 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15894 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
15895 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15896 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15898 // Two to the power of half-word-size.
15899 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15901 // Clear upper part of LO, lower HI.
15902 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15903 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
// Both halves are <= 0xFFFF, so the *signed* CVTSI2P conversion is exact.
15905 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15906 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15907 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15909 // Add the two halves.
15910 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
// v4i32/v8i32 -> v4f32/v8f32 unsigned conversion using the "blend with
// exponent constants" trick described in the comment block below.
15913 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15914 const X86Subtarget &Subtarget) {
15915 // The algorithm is the following:
15916 // #ifdef __SSE4_1__
15917 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15918 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15919 // (uint4) 0x53000000, 0xaa);
15921 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15922 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15924 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15925 // return (float4) lo + fhi;
15927 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15928 // reassociate the two FADDs, and if we do that, the algorithm fails
15929 // spectacularly (PR24512).
15930 // FIXME: If we ever have some kind of Machine FMF, this should be marked
15931 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15932 // there's also the MachineCombiner reassociations happening on Machine IR.
15933 if (DAG.getTarget().Options.UnsafeFPMath)
15937 SDValue V = Op->getOperand(0);
15938 MVT VecIntVT = V.getSimpleValueType();
15939 bool Is128 = VecIntVT == MVT::v4i32;
15940 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
15941 // If we convert to something else than the supported type, e.g., to v4f64,
15943 if (VecFloatVT != Op->getSimpleValueType(0))
15946 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15947 "Unsupported custom type");
15949 // In the #idef/#else code, we have in common:
15950 // - The vector of constants:
// 0x4b000000 is the float 2^23; 0x53000000 is the float 2^39.
15956 // Create the splat vector for 0x4b000000.
15957 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15958 // Create the splat vector for 0x53000000.
15959 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15961 // Create the right shift.
15962 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15963 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
15966 if (Subtarget.hasSSE41()) {
// SSE4.1 path: build lo/hi with 16-bit blends (mask 0xaa keeps the odd
// i16 lanes, i.e. the high halves, from the constant).
15967 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15968 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15969 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15970 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
15971 // Low will be bitcasted right away, so do not bother bitcasting back to its
15973 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15974 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15975 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15976 // (uint4) 0x53000000, 0xaa);
15977 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15978 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15979 // High will be bitcasted right away, so do not bother bitcasting back to
15980 // its original type.
15981 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15982 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
// (else: pre-SSE4.1 path builds lo/hi with AND + OR.)
15984 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15985 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15986 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15987 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15989 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15990 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
15993 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15994 SDValue VecCstFAdd = DAG.getConstantFP(
15995 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
15997 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15998 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15999 // TODO: Are there any fast-math-flags to propagate here?
16001 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
16002 // return (float4) lo + fhi;
16003 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
16004 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
// Dispatcher for vector UINT_TO_FP: widens v2i1 masks, then routes v2i32
// and v4i32/v8i32 sources to their dedicated expansion helpers.
16007 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
16008 const X86Subtarget &Subtarget) {
16009 SDValue N0 = Op.getOperand(0);
16010 MVT SrcVT = N0.getSimpleValueType();
16013 if (SrcVT == MVT::v2i1) {
16014 // For v2i1, we need to widen to v4i1 first.
16015 assert(Op.getValueType() == MVT::v2f64 && "Unexpected type");
16016 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0,
16017 DAG.getUNDEF(MVT::v2i1));
// Zero-extend the mask so lanes become 0/1 before the unsigned convert.
16018 return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64,
16019 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0));
16022 switch (SrcVT.SimpleTy) {
16024 llvm_unreachable("Custom UINT_TO_FP is not supported!");
16026 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
// v4i32/v8i32 only reach here without AVX512 (AVX512 handles them natively).
16029 assert(!Subtarget.hasAVX512());
16030 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
// Lowers ISD::UINT_TO_FP. Vector sources are delegated to
// lowerUINT_TO_FP_vec; scalars either stay legal (AVX512 VCVTUSI2SS/SD),
// use the i64/i32 bit-trick expansions, or fall back to an x87 FILD on a
// 64-bit stack buffer with a sign-dependent 2^64 fudge-factor correction.
16034 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
16035 SelectionDAG &DAG) const {
16036 SDValue N0 = Op.getOperand(0);
16038 auto PtrVT = getPointerTy(DAG.getDataLayout());
16040 if (Op.getSimpleValueType().isVector())
16041 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
16043 MVT SrcVT = N0.getSimpleValueType();
16044 MVT DstVT = Op.getSimpleValueType();
16046 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
16047 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
16048 // Conversions from unsigned i32 to f32/f64 are legal,
16049 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
16053 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
16054 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
16055 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
16056 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
16057 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
16060 // Make a 64-bit buffer, and use it to build an FILD.
16061 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
16062 if (SrcVT == MVT::i32) {
// Store the u32 in the low half and zero the high half: the value is then
// a non-negative i64, so a plain signed FILD is exact.
16063 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
16064 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
16065 StackSlot, MachinePointerInfo());
16066 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
16067 OffsetSlot, MachinePointerInfo());
16068 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
16072 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
16073 SDValue ValueToStore = Op.getOperand(0);
16074 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
16075 // Bitcasting to f64 here allows us to do a single 64-bit store from
16076 // an SSE register, avoiding the store forwarding penalty that would come
16077 // with two 32-bit stores.
16078 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16079 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16080 MachinePointerInfo());
16081 // For i64 source, we need to add the appropriate power of 2 if the input
16082 // was negative. This is the same as the optimization in
16083 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
16084 // we must be careful to do the computation in x87 extended precision, not
16085 // in SSE. (The generic code can't know it's OK to do this, or how to.)
16086 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
16087 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16088 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16089 MachineMemOperand::MOLoad, 8, 8);
// FILD in f80 so the later fudge addition is done in extended precision.
16091 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
16092 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
16093 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
// 0x5F800000 is the float 2^64, the amount by which a signed FILD of a
// "negative" u64 is off.
16096 APInt FF(32, 0x5F800000ULL);
16098 // Check whether the sign bit is set.
16099 SDValue SignSet = DAG.getSetCC(
16100 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
16101 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
16103 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
16104 SDValue FudgePtr = DAG.getConstantPool(
16105 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
16107 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
16108 SDValue Zero = DAG.getIntPtrConstant(0, dl);
16109 SDValue Four = DAG.getIntPtrConstant(4, dl);
16110 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
16111 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
16113 // Load the value out, extending it from f32 to f80.
16114 // FIXME: Avoid the extend by constructing the right constant pool?
16115 SDValue Fudge = DAG.getExtLoad(
16116 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
16117 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
16118 /* Alignment = */ 4);
16119 // Extend everything to 80 bits to force it to be done on x87.
16120 // TODO: Are there any fast-math-flags to propagate here?
16121 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
// Round the f80 sum down to the requested destination type.
16122 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
16123 DAG.getIntPtrConstant(0, dl));
16126 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
16127 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
16128 // just return an <SDValue(), SDValue()> pair.
16129 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
16130 // to i16, i32 or i64, and we lower it to a legal sequence.
16131 // If lowered to the final integer result we return a <result, SDValue()> pair.
16132 // Otherwise we lower it to a sequence ending with a FIST, return a
16133 // <FIST, StackSlot> pair, and the caller is responsible for loading
16134 // the final integer result from StackSlot.
16135 std::pair<SDValue,SDValue>
16136 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
16137 bool IsSigned, bool IsReplace) const {
16140 EVT DstTy = Op.getValueType();
16141 EVT TheVT = Op.getOperand(0).getValueType();
16142 auto PtrVT = getPointerTy(DAG.getDataLayout());
16144 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
16145 // f16 must be promoted before using the lowering in this routine.
16146 // fp128 does not use this lowering.
16147 return std::make_pair(SDValue(), SDValue());
16150 // If using FIST to compute an unsigned i64, we'll need some fixup
16151 // to handle values above the maximum signed i64. A FIST is always
16152 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
16153 bool UnsignedFixup = !IsSigned &&
16154 DstTy == MVT::i64 &&
16155 (!Subtarget.is64Bit() ||
16156 !isScalarFPTypeInSSEReg(TheVT));
16158 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
16159 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
16160 // The low 32 bits of the fist result will have the correct uint32 result.
16161 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
16165 assert(DstTy.getSimpleVT() <= MVT::i64 &&
16166 DstTy.getSimpleVT() >= MVT::i16 &&
16167 "Unknown FP_TO_INT to lower!");
16169 // These are really Legal.
16170 if (DstTy == MVT::i32 &&
16171 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16172 return std::make_pair(SDValue(), SDValue());
16173 if (Subtarget.is64Bit() &&
16174 DstTy == MVT::i64 &&
16175 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16176 return std::make_pair(SDValue(), SDValue());
16178 // We lower FP->int64 into FISTP64 followed by a load from a temporary
16180 MachineFunction &MF = DAG.getMachineFunction();
16181 unsigned MemSize = DstTy.getSizeInBits()/8;
16182 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16183 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
// Pick the FP_TO_INT*_IN_MEM node matching the destination width.
16186 switch (DstTy.getSimpleVT().SimpleTy) {
16187 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
16188 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
16189 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
16190 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
16193 SDValue Chain = DAG.getEntryNode();
16194 SDValue Value = Op.getOperand(0);
16195 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16197 if (UnsignedFixup) {
16199 // Conversion to unsigned i64 is implemented with a select,
16200 // depending on whether the source value fits in the range
16201 // of a signed i64. Let Thresh be the FP equivalent of
16202 // 0x8000000000000000ULL.
16204 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16205 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16206 // Fist-to-mem64 FistSrc
16207 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16208 // to XOR'ing the high 32 bits with Adjust.
16210 // Being a power of 2, Thresh is exactly representable in all FP formats.
16211 // For X87 we'd like to use the smallest FP type for this constant, but
16212 // for DAG type consistency we have to match the FP operand type.
// 0x5f000000 is the float 2^63, i.e. (double)0x8000000000000000ULL.
16214 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
16215 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16216 bool LosesInfo = false;
16217 if (TheVT == MVT::f64)
16218 // The rounding mode is irrelevant as the conversion should be exact.
16219 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16221 else if (TheVT == MVT::f80)
16222 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16223 APFloat::rmNearestTiesToEven, &LosesInfo);
16225 assert(Status == APFloat::opOK && !LosesInfo &&
16226 "FP conversion should have been exact");
16228 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16230 SDValue Cmp = DAG.getSetCC(DL,
16231 getSetCCResultType(DAG.getDataLayout(),
16232 *DAG.getContext(), TheVT),
16233 Value, ThreshVal, ISD::SETLT);
16234 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16235 DAG.getConstant(0, DL, MVT::i32),
16236 DAG.getConstant(0x80000000, DL, MVT::i32));
16237 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
// The comparison is recomputed rather than reused so each select has its
// own setcc operand.
16238 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16239 *DAG.getContext(), TheVT),
16240 Value, ThreshVal, ISD::SETLT);
16241 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16244 // FIXME This causes a redundant load/store if the SSE-class value is already
16245 // in memory, such as if it is on the callstack.
16246 if (isScalarFPTypeInSSEReg(TheVT)) {
16247 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
// SSE value: spill it, FLD it onto the x87 stack (FIST only works from
// x87), and allocate a fresh slot for the FIST result.
16248 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16249 MachinePointerInfo::getFixedStack(MF, SSFI));
16250 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16252 Chain, StackSlot, DAG.getValueType(TheVT)
16255 MachineMemOperand *MMO =
16256 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16257 MachineMemOperand::MOLoad, MemSize, MemSize);
16258 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16259 Chain = Value.getValue(1);
16260 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16261 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16264 MachineMemOperand *MMO =
16265 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16266 MachineMemOperand::MOStore, MemSize, MemSize);
16268 if (UnsignedFixup) {
16270 // Insert the FIST, load its result as two i32's,
16271 // and XOR the high i32 with Adjust.
16273 SDValue FistOps[] = { Chain, Value, StackSlot };
16274 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16275 FistOps, DstTy, MMO);
16278 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16279 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16282 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
// XOR with 0x80000000 undoes the threshold subtraction in the high word.
16283 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16285 if (Subtarget.is64Bit()) {
16286 // Join High32 and Low32 into a 64-bit result.
16287 // (High32 << 32) | Low32
16288 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16289 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16290 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16291 DAG.getConstant(32, DL, MVT::i8));
16292 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16293 return std::make_pair(Result, SDValue());
// 32-bit target: hand the two halves back either as a BUILD_PAIR (when
// replacing the original node) or as merged values.
16296 SDValue ResultOps[] = { Low32, High32 };
16298 SDValue pair = IsReplace
16299 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16300 : DAG.getMergeValues(ResultOps, DL);
16301 return std::make_pair(pair, SDValue());
16303 // Build the FP_TO_INT*_IN_MEM
16304 SDValue Ops[] = { Chain, Value, StackSlot };
16305 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
// Caller loads the integer result from StackSlot.
16307 return std::make_pair(FIST, StackSlot);
// Lowers wide vector ANY/ZERO_EXTENDs on AVX targets. With AVX2 (Int256)
// a single VZEXT suffices; plain AVX splits the input with unpcklo/unpckhi
// against zero (or undef) and concatenates the two halves.
16311 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16312 const X86Subtarget &Subtarget) {
16313 MVT VT = Op->getSimpleValueType(0);
16314 SDValue In = Op->getOperand(0);
16315 MVT InVT = In.getSimpleValueType();
// Only the listed (dst, src) type pairs are handled; anything else bails.
16318 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16319 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16320 (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
16321 (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
16322 (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
16323 (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
16324 (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
16325 (VT != MVT::v32i16 || InVT != MVT::v32i8))
16328 if (Subtarget.hasInt256())
16329 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16331 // Optimize vectors in AVX mode:
16334 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16335 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16336 // Concat upper and lower parts.
16339 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16340 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16341 // Concat upper and lower parts.
// ZERO_EXTEND interleaves with zeros; ANY_EXTEND may interleave with undef.
16344 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16345 SDValue Undef = DAG.getUNDEF(InVT);
16346 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16347 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16348 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
// Each unpacked half is reinterpreted as the wider-element half vector.
16350 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16351 VT.getVectorNumElements()/2);
16353 OpLo = DAG.getBitcast(HVT, OpLo);
16354 OpHi = DAG.getBitcast(HVT, OpHi);
16356 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
// Lowers ZERO_EXTEND from a vXi1 mask vector by materializing it as a
// vector select of splat-1 vs splat-0, widening to 512 bits when VLX is
// unavailable, then truncating/extracting back to the requested type.
16359 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
16360 const X86Subtarget &Subtarget,
16361 SelectionDAG &DAG) {
16362 MVT VT = Op->getSimpleValueType(0);
16363 SDValue In = Op->getOperand(0);
16364 MVT InVT = In.getSimpleValueType();
16365 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
16367 unsigned NumElts = VT.getVectorNumElements();
16369 // Extend VT if the scalar type is v8/v16 and BWI is not supported.
// Without BWI there is no i8/i16 element select on mask registers, so
// select in i32 elements and truncate afterwards.
16371 if (!Subtarget.hasBWI() &&
16372 (VT.getVectorElementType().getSizeInBits() <= 16))
16373 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
16375 // Widen to 512-bits if VLX is not supported.
16376 MVT WideVT = ExtVT;
16377 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
16378 NumElts *= 512 / ExtVT.getSizeInBits();
16379 InVT = MVT::getVectorVT(MVT::i1, NumElts);
// Pad the i1 mask into a wider mask register (upper lanes undef).
16380 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
16381 In, DAG.getIntPtrConstant(0, DL));
16382 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
// Zero-extension of an i1 lane is exactly "select(mask, 1, 0)".
16386 SDValue One = DAG.getConstant(1, DL, WideVT);
16387 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
16389 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
16391 // Truncate if we had to extend i16/i8 above.
16393 WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16394 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
16397 // Extract back to 128/256-bit if we widened.
16399 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
16400 DAG.getIntPtrConstant(0, DL));
16402 return SelectedVal;
/// Lower ISD::ZERO_EXTEND for vector types, dispatching to the vXi1-mask
/// path or the AVX integer-extend path as appropriate.
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();
  // vXi1 mask sources take a dedicated lowering path.
  if (SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
  // On AVX targets try the PMOVZX-style widening lowering first.
  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
  // Reaching here, the extend must change both width and element count.
  assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
         Op.getSimpleValueType().getVectorNumElements() !=
             SVT.getVectorNumElements());
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
///
/// \param Opcode  X86ISD::PACKSS or X86ISD::PACKUS (asserted below).
/// \param DstVT   Requested truncated type; must be a multiple of 128 bits.
/// \param In      Source vector; must be a multiple of 256 bits.
/// \returns the truncated vector, or an empty SDValue if unsupported.
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
                                      const SDLoc &DL, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
         "Unexpected PACK opcode");
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
  EVT SrcVT = In.getValueType();
  // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
  // We only support vector truncation to 128bits or greater from a
  // 256bits or greater source.
  unsigned DstSizeInBits = DstVT.getSizeInBits();
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
  LLVMContext &Ctx = *DAG.getContext();
  unsigned NumElems = SrcVT.getVectorNumElements();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
  // Each PACK halves the element width, so the per-step result type has
  // elements of half the source scalar width.
  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
  // Extract lower/upper subvectors.
  unsigned NumSubElts = NumElems / 2;
  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  // PACKUSDW (dword->word) needs SSE4.1; PACKUSWB/PACKSS* only need SSE2.
  EVT InVT = MVT::i16, OutVT = MVT::i8;
  if (DstVT.getScalarSizeInBits() > 8 &&
      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
  unsigned SubSizeInBits = SrcSizeInBits / 2;
  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    Res = DAG.getBitcast(MVT::v4i64, Res);
    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);
    // If 512bit -> 128bit truncate another stage.
    EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
  Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
  PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
/// Lower a vector TRUNCATE to vXi1 by moving the lsb of each element into
/// the sign bit and converting to a mask with VPMOVB2M/VPMOVW2M (BWI) or a
/// TESTM of sign-extended dword/qword elements.
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
  if (InVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // legal, will go to VPMOVB2M, VPMOVW2M
      // The shift is only needed if the sign bit isn't already a copy of
      // the lsb (i.e. the value isn't already all-sign-bits).
      if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
        // We need to shift to get the lsb into sign position.
        // Shift packed bytes not supported natively, bitcast to word
        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
        In = DAG.getNode(ISD::SHL, DL, ExtVT,
                         DAG.getBitcast(ExtVT, In),
                         DAG.getConstant(ShiftInx, DL, ExtVT));
        In = DAG.getBitcast(InVT, In);
      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, In);
    // Use TESTD/Q, extended vector to packed dword/qword.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    // Without VLX, widen the element type so the whole vector is 512 bits.
    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
    // We need to shift to get the lsb into sign position.
    In = DAG.getNode(ISD::SHL, DL, InVT, In,
                     DAG.getConstant(ShiftInx, DL, InVT));
  // TESTM sets each mask bit if (In & In) is non-zero, i.e. the sign bit.
  return DAG.getNode(X86ISD::TESTM, DL, VT, In, In);
/// Custom-lower vector ISD::TRUNCATE. Strategy, in order of preference:
/// vXi1 mask path; AVX-512 VPMOV* truncates; PACKSS/PACKUS when enough
/// sign/zero bits are known; then hand-written shuffle sequences for
/// v4i64->v4i32 and v8i32->v8i16; finally a generic 256->128-bit shuffle.
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  unsigned InNumEltBits = InVT.getScalarSizeInBits();
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");
  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);
  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    // word to byte only under BWI
    // Without BWI, sign-extend v16i16 to v16i32 first so VPMOVDB applies.
    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
  // Truncate with PACKSS if we are truncating a vector with sign-bits that
  // extend all the way to the packed/truncated value.
  unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
  if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
    truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
  // Truncate with PACKUS if we are truncating a vector with leading zero bits
  // that extend all the way to the packed/truncated value.
  // Pre-SSE41 we can only use PACKUSWB.
  DAG.computeKnownBits(In, Known);
  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
  if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
    truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      // Pick the even dwords (low half of each qword) into the low lane.
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    // AVX1 fallback: split into two v2i64 halves and shuffle the even
    // dwords of both halves together.
    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2, DL));
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      In = DAG.getBitcast(MVT::v32i8, In);
      // The PSHUFB mask:
      // Keep the low word of each dword within each 128-bit lane.
      static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
                                       -1, -1, -1, -1, -1, -1, -1, -1,
                                       16, 17, 20, 21, 24, 25, 28, 29,
                                       -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
      In = DAG.getBitcast(MVT::v4i64, In);
      // Merge the defined qwords from both lanes into the low 128 bits.
      static const int ShufMask2[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(VT, In);
    // AVX1 fallback: PSHUFB each 128-bit half, then combine with MOVLHPS.
    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(4, DL));
    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
    // The PSHUFB mask:
    static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1};
    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
    // The MOVLHPS Mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, res);
  // Handle truncation of V256 to V128 using shuffles.
  if (!VT.is128BitVector() || !InVT.is256BitVector())
  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
  unsigned NumElems = VT.getVectorNumElements();
  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
  // Prepare truncation shuffle mask
  // Select every other (even) element; the rest stay undef.
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = i * 2;
  In = DAG.getBitcast(NVT, In);
  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0, DL));
/// Custom-lower ISD::FP_TO_SINT / ISD::FP_TO_UINT. Small vector cases are
/// handled with AVX-512 CVTTP2SI/CVTTP2UI (widening when VLX is absent);
/// scalar cases go through FP_TO_INTHelper and a stack slot when needed.
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  MVT VT = Op.getSimpleValueType();
  if (VT.isVector()) {
    SDValue Src = Op.getOperand(0);
    if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
      MVT ResVT = MVT::v4i32;
      MVT TruncVT = MVT::v4i1;
      unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
      // Unsigned conversions without VLX are only legal at 512 bits.
      if (!IsSigned && !Subtarget.hasVLX()) {
        // Widen to 512-bits.
        ResVT = MVT::v8i32;
        TruncVT = MVT::v8i1;
        Opc = ISD::FP_TO_UINT;
        Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
                          DAG.getUNDEF(MVT::v8f64),
                          Src, DAG.getIntPtrConstant(0, dl));
      // Convert, truncate to i1 elements, then extract the original width.
      SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
      Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
                         DAG.getIntPtrConstant(0, dl));
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
      // Pad the v2f32 source out to v4f32 with undef for the conversion.
      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                     DAG.getUNDEF(MVT::v2f32)));
  assert(!VT.isVector());
  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
      IsSigned, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FIST.getNode())
  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
  // The node is the result.
/// Custom-lower FP_EXTEND of v2f32 by padding the source to v4f32 with
/// undef and using VFPEXT (CVTPS2PD) on the wider vector.
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();
  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                 In, DAG.getUNDEF(SVT)));
/// Lower ISD::FABS / ISD::FNEG as a bitwise logic op with a constant mask:
/// FABS = FAND with the sign bit cleared in the mask, FNEG = FXOR with the
/// sign-bit mask, FNEG(FABS(x)) = FOR with the sign-bit mask (FNABS).
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");
  bool IsFABS = (Op.getOpcode() == ISD::FABS);
  // If this is a FABS and it has an FNEG user, bail out to fold the combination
  // into an FNABS. We'll lower the FABS after that if it is still in use.
  for (SDNode *User : Op->uses())
    if (User->getOpcode() == ISD::FNEG)
  MVT VT = Op.getSimpleValueType();
  bool IsF128 = (VT == MVT::f128);
  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.
  if (VT.isVector()) {
    EltVT = VT.getVectorElementType();
  } else if (IsF128) {
    // SSE instructions are used for optimized f128 logical operations.
    LogicVT = MVT::f128;
  // There are no scalar bitwise logical SSE/AVX instructions, so we
  // generate a 16-byte vector constant and logic op even for the scalar case.
  // Using a 16-byte mask allows folding the load of the mask with
  // the logic op, so it can save (~4 bytes) on code size.
  LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
  unsigned EltBits = EltVT.getSizeInBits();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
  IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
  SDValue Op0 = Op.getOperand(0);
  // FNEG of FABS folds to FNABS: OR in the sign bit of the inner operand.
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
  if (VT.isVector() || IsF128)
    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  // For the scalar case extend to a 128-bit vector, perform the logic op,
  // and extract the scalar result back out.
  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getIntPtrConstant(0, dl));
/// Lower ISD::FCOPYSIGN as bitwise logic: isolate the sign bit of the Sign
/// operand with FAND, clear the sign bit of the Mag operand, and FOR the
/// two together. Scalar types are processed as 128-bit vectors.
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  // If the sign operand is smaller, extend it first.
  MVT VT = Op.getSimpleValueType();
  if (Sign.getSimpleValueType().bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
  // And if it is bigger, shrink it first.
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");
  MVT EltVT = VT.getScalarType();
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble()
                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
  // Perform all scalar logic operations as 16-byte vectors because there are no
  // scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
  // The mask constants are automatically splatted for vector types.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  // SignMask keeps only the sign bit; MagMask keeps everything else.
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  // First, clear all bits but the sign bit from the second operand (sign).
  Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
    APFloat APF = Op0CN->getValueAPF();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  // If the magnitude operand wasn't a constant, we need to AND out the sign.
  Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
  MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  // OR the magnitude value with the sign bit.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                          DAG.getIntPtrConstant(0, dl));
/// Lower ISD::FGETSIGN by broadcasting the scalar into a vector, extracting
/// the sign-bit mask with MOVMSK, and masking to bit 0.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = N0.getSimpleValueType();
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");
  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
  // MOVMSK puts our scalar's sign bit in bit 0; mask away the other lanes.
  Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
// Check whether an OR'd tree is PTEST-able.
// Matches an OR tree whose leaves are EXTRACT_VECTOR_ELTs that together cover
// every element of one or more same-typed 128/256-bit vectors; if so, emits a
// single PTEST of the OR of those vectors instead of scalar ORs + compare.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
  // PTEST requires SSE4.1.
  if (!Subtarget.hasSSE41())
  if (!Op->hasOneUse())
  SDNode *N = Op.getNode();
  // Worklist of OR operands still to visit.
  SmallVector<SDValue, 8> Opnds;
  // Maps each source vector to a bitmask of the element indices seen so far.
  DenseMap<SDValue, unsigned> VecInMap;
  SmallVector<SDValue, 8> VecIns;
  EVT VT = MVT::Other;
  // Recognize a special case where a vector is casted into wide integer to
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));
  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
    // Quit if a non-EXTRACT_VECTOR_ELT
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
      VecIns.push_back(ExtractedFromVec);
    // Record this element index as covered for its source vector.
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");
  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
  for (DenseMap<SDValue, unsigned>::const_iterator
       I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  // PTEST(V, V) sets ZF iff V is all zeros.
  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
/// \brief return true if \c Op has a use that doesn't just read flags.
/// Flag-only users are BRCOND, SETCC, and SELECT (as the condition operand),
/// possibly behind a single-use TRUNCATE.
static bool hasNonFlagsUse(SDValue Op) {
  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
    SDNode *User = *UI;
    unsigned UOpNo = UI.getOperandNo();
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate to its (single) user.
      UOpNo = User->use_begin().getOperandNo();
      User = *User->use_begin();
    // SELECT only reads flags through operand 0 (the condition).
    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent. Where possible, reuse the EFLAGS result of the arithmetic op
/// that computed \p Op (ADD/SUB/AND/OR/XOR, INC/DEC) instead of emitting a
/// separate CMP-with-zero; otherwise fall back to the TEST pattern.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
                                    SelectionDAG &DAG) const {
  // i1 is compared as an i8 zero-extension.
  if (Op.getValueType() == MVT::i1) {
    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
                       DAG.getConstant(0, dl, MVT::i8));
  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  // Unsigned comparisons read CF.
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
  // Signed comparisons and explicit overflow checks read OF.
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
      if (Op.getNode()->getFlags().hasNoSignedWrap())
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  unsigned Opcode = 0;
  unsigned NumOperands = 0;
  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        NeedTruncation = true;
  // Sometimes flags can be set either with an AND or with an SRL/SHL
  // instruction. SRL/SHL variant should be preferred for masks longer than this
  const int ShiftToAndMaxMaskWidth = 32;
  const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
    // We only want to rewrite this as a target-specific node with attached
    // flags if there is a reasonable chance of either using that to do custom
    // instructions selection that can fold some of the memory operands, or if
    // only the flags are used. If there are other uses, leave the node alone
    // and emit a test instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
    if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
      // An add of one will be selected as an INC.
      (!Subtarget.slowIncDec() ||
       DAG.getMachineFunction().getFunction().optForSize())) {
        Opcode = X86ISD::INC;
      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->isAllOnesValue() &&
          (!Subtarget.slowIncDec() ||
           DAG.getMachineFunction().getFunction().optForSize())) {
        Opcode = X86ISD::DEC;
    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    // If we have a constant logical shift that's only used in a comparison
    // against zero turn it into an equivalent AND. This allows turning it into
    // a TEST instruction later.
    if (ZeroCheck && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
      // SRL zeroes the high bits, SHL zeroes the low bits, so the surviving
      // bits are exactly what the equivalent AND must test.
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      // Keep the shift form for masks too wide to encode cheaply.
      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                       DAG.getConstant(Mask, dl, VT));
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better. However, AND should be
    // preferred if the instruction can be combined into ANDN.
    if (!hasNonFlagsUse(Op)) {
      SDValue Op0 = ArithOp->getOperand(0);
      SDValue Op1 = ArithOp->getOperand(1);
      EVT VT = ArithOp.getValueType();
      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
      // ANDN (BMI) only exists for 32- and 64-bit operands.
      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
      bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
      // If we cannot select an ANDN instruction, check if we can replace
      // AND+IMM64 with a shift before giving up. This is possible for masks
      // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
      if (!isProperAndn) {
        assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
        auto *CN = dyn_cast<ConstantSDNode>(Op1);
        const APInt &Mask = CN->getAPIntValue();
        if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
          break; // Prefer TEST instruction.
        unsigned BitWidth = Mask.getBitWidth();
        unsigned LeadingOnes = Mask.countLeadingOnes();
        unsigned TrailingZeros = Mask.countTrailingZeros();
        // Mask of the form 1...10...0: test the high bits via SRL.
        if (LeadingOnes + TrailingZeros == BitWidth) {
          assert(TrailingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
        unsigned LeadingZeros = Mask.countLeadingZeros();
        unsigned TrailingOnes = Mask.countTrailingOnes();
        // Mask of the form 0...01...1: test the low bits via SHL.
        if (LeadingZeros + TrailingOnes == BitWidth) {
          assert(LeadingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
    // Similar to ISD::ADD above, check if the uses will preclude useful
    // lowering of the target-specific node.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
      // An OR tree compared with zero may be replaced by a single PTEST.
      if (!NeedTruncation && ZeroCheck) {
        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
      Opcode = X86ISD::OR;
    // These ops already produce flags as result number 1.
    return SDValue(Op.getNode(), 1);
  // If we found that truncation is beneficial, perform the truncation and
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
      // Narrow both operands and redo the op at the truncated width so the
      // flags come directly from the narrow arithmetic.
      SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
      SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
      Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  // Rebuild the arithmetic op as its flag-producing X86ISD equivalent and
  // replace all uses so the flags result (result number 1) can be consumed.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
17304 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
// equivalent, and return the flag-producing value (result #1 of the
// X86ISD::SUB below, or the MVT::i32 result of the X86ISD::CMP).
// NOTE(review): this numbered listing elides a few lines (e.g. 17310,
// 17326, 17331, 17333) — blank lines and closing braces from the original.
17306 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17307 const SDLoc &dl, SelectionDAG &DAG) const {
// A compare against zero is just a TEST; delegate to EmitTest.
17308 if (isNullConstant(Op1))
17309 return EmitTest(Op0, X86CC, dl, DAG);
17311 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17312 "Unexpected comparison operation for MVT::i1 operands");
// Scalar integer compares are emitted as X86ISD::SUB so an identical
// user-visible subtraction can CSE with the compare (see comment at 17327).
17314 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17315 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17316 // Only promote the compare up to I32 if it is a 16 bit operation
17317 // with an immediate. 16 bit immediates are to be avoided.
// (Promotion is skipped when minimizing size or on Atom, where the wider
// encoding is not worth it.)
17318 if ((Op0.getValueType() == MVT::i16 &&
17319 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17320 !DAG.getMachineFunction().getFunction().optForMinSize() &&
17321 !Subtarget.isAtom()) {
// Widen with zext for unsigned predicates, sext for signed ones, so the
// i32 compare produces the same flag outcomes as the i16 compare would.
17322 unsigned ExtendOp =
17323 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17324 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17325 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17327 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17328 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17329 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
// Result #1 (the MVT::i32 value in VTs) carries the flags.
17330 return SDValue(Sub.getNode(), 1);
// Non-promotable types fall through to a plain flag-producing CMP.
17332 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17335 /// Convert a comparison if required by the subtarget.
// On targets without CMOV/FUCOMI, an FP compare result lives in FPSW, not
// EFLAGS; this rewrites the compare into an FNSTSW/SAHF sequence so later
// flag consumers work unchanged.
17336 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17337 SelectionDAG &DAG) const {
17338 // If the subtarget does not support the FUCOMI instruction, floating-point
17339 // comparisons have to be converted.
17340 if (Subtarget.hasCMov() ||
17341 Cmp.getOpcode() != X86ISD::CMP ||
17342 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17343 !Cmp.getOperand(1).getValueType().isFloatingPoint())
// NOTE(review): listing gap (17344-17345, 17350) — the early return taken
// when no conversion is needed, and the `dl` (SDLoc) declaration used
// below, are elided from this numbered dump.
17346 // The instruction selector will select an FUCOM instruction instead of
17347 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
17348 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
17349 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
17351 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
17352 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
// Shift the high byte of FPSW down so SAHF (which loads AH) sees the
// condition bits, per the sequence documented above.
17353 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
17354 DAG.getConstant(8, dl, MVT::i8));
17355 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
17357 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
17358 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
17359 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
17362 /// Check if replacement of SQRT with RSQRT should be disabled.
// Returns true ("sqrt is cheap") to suppress the RSQRT-estimate transform.
17363 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
17364 EVT VT = Op.getValueType();
17366 // We never want to use both SQRT and RSQRT instructions for the same input.
17367 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
// NOTE(review): listing gap (17368-17370) — the body of this `if`
// (presumably `return true;`) and the vector/scalar dispatch condition
// that selects between the two returns below are elided from this dump.
17371 return Subtarget.hasFastVectorFSQRT();
17372 return Subtarget.hasFastScalarFSQRT();
17375 /// The minimum architected relative accuracy is 2^-12. We need one
17376 /// Newton-Raphson step to have a good float result (24 bits of precision).
// Emits X86ISD::FRSQRT for profitable type/subtarget combinations and tells
// the generic combiner how many Newton-Raphson steps to run.
17377 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
17378 SelectionDAG &DAG, int Enabled,
17379 int &RefinementSteps,
17380 bool &UseOneConstNR,
17381 bool Reciprocal) const {
17382 EVT VT = Op.getValueType();
17384 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
17385 // TODO: Add support for AVX512 (v16f32).
17386 // It is likely not profitable to do this for f64 because a double-precision
17387 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
17388 // instructions: convert to single, rsqrtss, convert back to double, refine
17389 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
17390 // along with FMA, this could be a throughput win.
17391 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
17392 // after legalize types.
17393 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17394 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
17395 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
17396 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
// One refinement step suffices per the accuracy note in the header comment.
17397 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17398 RefinementSteps = 1;
17400 UseOneConstNR = false;
17401 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
// NOTE(review): listing gap — the fall-through "no estimate" return for
// unsupported types and the closing brace are elided from this dump.
17406 /// The minimum architected relative accuracy is 2^-12. We need one
17407 /// Newton-Raphson step to have a good float result (24 bits of precision).
// Emits X86ISD::FRCP (rcpss/rcpps) where profitable; mirrors
// getSqrtEstimate above.
17408 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
// NOTE(review): listing gap (17409) — the `int Enabled` parameter line is
// elided from this numbered dump; `Enabled` is referenced below.
17410 int &RefinementSteps) const {
17411 EVT VT = Op.getValueType();
17413 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
17414 // TODO: Add support for AVX512 (v16f32).
17415 // It is likely not profitable to do this for f64 because a double-precision
17416 // reciprocal estimate with refinement on x86 prior to FMA requires
17417 // 15 instructions: convert to single, rcpss, convert back to double, refine
17418 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
17419 // along with FMA, this could be a throughput win.
17421 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17422 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
17423 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17424 // Enable estimate codegen with 1 refinement step for vector division.
17425 // Scalar division estimates are disabled because they break too much
17426 // real-world code. These defaults are intended to match GCC behavior.
17427 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
// NOTE(review): listing gap (17428-17429) — the scalar opt-out return
// taken by this `if` is elided from this dump.
17430 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17431 RefinementSteps = 1;
17433 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
// NOTE(review): listing gap — the fall-through return and closing brace
// are elided from this dump.
17438 /// If we have at least two divisions that use the same divisor, convert to
17439 /// multiplication by a reciprocal. This may need to be adjusted for a given
17440 /// CPU if a division's cost is not at least twice the cost of a multiplication.
17441 /// This is because we still need one division to calculate the reciprocal and
17442 /// then we need two multiplies by that reciprocal as replacements for the
17443 /// original divisions.
// Returns the minimum number of same-divisor divisions that makes the
// reciprocal transform worthwhile.
17444 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
// NOTE(review): listing gap — the function body (the returned threshold;
// per the header comment presumably 2) and closing brace are elided from
// this numbered dump.
17448 /// Helper for creating a X86ISD::SETCC node.
17449 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17450 SelectionDAG &DAG) {
17451 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17452 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
17455 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
17456 /// according to equal/not-equal condition code \p CC.
17457 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
17458 const SDLoc &dl, SelectionDAG &DAG) {
17459 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
17460 // instruction. Since the shift amount is in-range-or-undefined, we know
17461 // that doing a bittest on the i32 value is ok. We extend to i32 because
17462 // the encoding for the i16 version is larger than the i32 version.
17463 // Also promote i16 to i32 for performance / code size reason.
17464 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
17465 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
17467 // See if we can use the 32-bit instruction instead of the 64-bit one for a
17468 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
17469 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
17470 // known to be zero.
17471 if (Src.getValueType() == MVT::i64 &&
17472 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
17473 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
17475 // If the operand types disagree, extend the shift amount to match. Since
17476 // BT ignores high bits (like shifts) we can use anyextend.
17477 if (Src.getValueType() != BitNo.getValueType())
17478 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
17480 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
17481 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
17482 return getSETCC(Cond, BT, dl , DAG);
17485 /// Result of 'and' is compared against zero. Change to a BT node if possible.
// Recognizes (and X, (shl 1, N)) and ((srl X, N) & 1)-style patterns and
// large power-of-two masks, lowering them via getBitTestCondition.
// NOTE(review): this numbered listing elides several lines (e.g. the
// `LHS`/`RHS` declarations, a KnownBits declaration before 17507, and the
// bail-out / fall-through `return SDValue();` paths).
17486 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
17487 const SDLoc &dl, SelectionDAG &DAG) {
17488 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
17489 SDValue Op0 = And.getOperand(0);
17490 SDValue Op1 = And.getOperand(1);
// BT ignores high index bits, so truncates on either operand may be peeled.
17491 if (Op0.getOpcode() == ISD::TRUNCATE)
17492 Op0 = Op0.getOperand(0);
17493 if (Op1.getOpcode() == ISD::TRUNCATE)
17494 Op1 = Op1.getOperand(0);
// Canonicalize any SHL operand into Op0.
17497 if (Op1.getOpcode() == ISD::SHL)
17498 std::swap(Op0, Op1);
17499 if (Op0.getOpcode() == ISD::SHL) {
// (and X, (shl 1, N)): the tested bit index is the shift amount N.
17500 if (isOneConstant(Op0.getOperand(0))) {
17501 // If we looked past a truncate, check that it's only truncating away
17503 unsigned BitWidth = Op0.getValueSizeInBits();
17504 unsigned AndBitWidth = And.getValueSizeInBits();
17505 if (BitWidth > AndBitWidth) {
17507 DAG.computeKnownBits(Op0, Known);
// Bail out (line elided here) if the truncate could drop non-zero bits.
17508 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
17512 RHS = Op0.getOperand(1);
17514 } else if (Op1.getOpcode() == ISD::Constant) {
// (and X, C) with constant mask C.
17515 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
17516 uint64_t AndRHSVal = AndRHS->getZExtValue();
17517 SDValue AndLHS = Op0;
// ((X >> N) & 1): test bit N of X.
17519 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
17520 LHS = AndLHS.getOperand(0);
17521 RHS = AndLHS.getOperand(1);
17524 // Use BT if the immediate can't be encoded in a TEST instruction.
// TEST immediates are at most 32 bits; a wider power-of-two mask is better
// expressed as a single bit test.
17525 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
17527 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
17532 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
17537 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
// (CMPPS/CMPPD-style) immediates. May swap Op0/Op1 to reach a representable
// predicate. Values 8 (SETUEQ) and 12 (SETONE) have no single-instruction
// pre-AVX encoding and are handled by the caller with a two-compare
// sequence (see LowerVSETCC's "SSECC >= 8" path).
// NOTE(review): this numbered listing elides lines, including the
// declarations of the SSECC/Swap locals, the switch's closing brace, the
// second parameter line, and the final `return SSECC;`.
17539 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
17544 // SSE Condition code mapping:
17553 switch (SetCCOpcode) {
17554 default: llvm_unreachable("Unexpected SETCC condition");
17556 case ISD::SETEQ: SSECC = 0; break;
// GT/GE/ULE/ULT swap the operands and reuse the mirrored predicate below.
17558 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
17560 case ISD::SETOLT: SSECC = 1; break;
17562 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17564 case ISD::SETOLE: SSECC = 2; break;
17565 case ISD::SETUO: SSECC = 3; break;
17567 case ISD::SETNE: SSECC = 4; break;
17568 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
17569 case ISD::SETUGE: SSECC = 5; break;
17570 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17571 case ISD::SETUGT: SSECC = 6; break;
17572 case ISD::SETO: SSECC = 7; break;
17573 case ISD::SETUEQ: SSECC = 8; break;
17574 case ISD::SETONE: SSECC = 12; break;
17577 std::swap(Op0, Op1);
17582 /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
17583 /// concatenate the result back.
// Used when 256-bit integer compares are unavailable (caller checks
// !Subtarget.hasInt256()).
// NOTE(review): this numbered listing elides a few lines, including the
// `dl` (SDLoc) declaration used below and the closing brace.
17584 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
17585 MVT VT = Op.getSimpleValueType();
17587 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17588 "Unsupported value type for operation");
17590 unsigned NumElems = VT.getVectorNumElements();
17592 SDValue CC = Op.getOperand(2);
17594 // Extract the LHS vectors
17595 SDValue LHS = Op.getOperand(0);
17596 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
17597 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
17599 // Extract the RHS vectors
17600 SDValue RHS = Op.getOperand(1);
17601 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
17602 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
17604 // Issue the operation on the smaller types and concatenate the result back
17605 MVT EltVT = VT.getVectorElementType();
17606 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17607 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
17608 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
17609 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
// Lower a setcc whose operands are vXi1 mask vectors into plain mask logic
// (XOR/AND/OR with all-ones for inversion) — no compare instruction needed.
// NOTE(review): this numbered listing elides lines, including the `dl`
// (SDLoc) declaration and the `case ISD::SET*:` labels that precede each
// commented return below (the "(x op y) -> ..." comments identify them).
17612 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17613 SDValue Op0 = Op.getOperand(0);
17614 SDValue Op1 = Op.getOperand(1);
17615 SDValue CC = Op.getOperand(2);
17616 MVT VT = Op.getSimpleValueType();
17619 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17620 "Unexpected type for boolean compare operation");
17621 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
// ~x / ~y as xor with all-ones, used by the ordered predicates below.
17622 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
17623 DAG.getConstant(-1, dl, VT));
17624 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
17625 DAG.getConstant(-1, dl, VT));
17626 switch (SetCCOpcode) {
17627 default: llvm_unreachable("Unexpected SETCC condition");
17629 // (x == y) -> ~(x ^ y)
17630 return DAG.getNode(ISD::XOR, dl, VT,
17631 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
17632 DAG.getConstant(-1, dl, VT));
17634 // (x != y) -> (x ^ y)
17635 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
17638 // (x > y) -> (x & ~y)
17639 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
17642 // (x < y) -> (~x & y)
17643 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
17646 // (x <= y) -> (~x | y)
17647 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
17650 // (x >=y) -> (x | ~y)
17651 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
// Lower an integer vector setcc whose result is an AVX-512 i1 mask vector,
// using PCMPEQM/PCMPGTM, TESTM/TESTNM, or the immediate-predicate
// CMPM/CMPMU forms.
// NOTE(review): this numbered listing elides lines, including the `dl`
// (SDLoc) declaration, the `SSECC`/`Opc`/`Swap` local declarations, the
// `if (Swap)` guard before 17686, the guard before the direct-`Opc` return
// at 17702, and the extra operands of the TESTM/TESTNM node at 17696.
17655 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17657 SDValue Op0 = Op.getOperand(0);
17658 SDValue Op1 = Op.getOperand(1);
17659 SDValue CC = Op.getOperand(2);
17660 MVT VT = Op.getSimpleValueType();
17663 assert(VT.getVectorElementType() == MVT::i1 &&
17664 "Cannot set masked compare for this operation");
17666 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17668 bool Unsigned = false;
// EQ/GT get dedicated mask-compare opcodes; everything else is encoded as
// a CMPM/CMPMU immediate (SSECC), possibly after an operand swap.
17671 switch (SetCCOpcode) {
17672 default: llvm_unreachable("Unexpected SETCC condition");
17673 case ISD::SETNE: SSECC = 4; break;
17674 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
17675 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
17676 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17677 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
17678 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17679 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17680 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
17681 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17682 case ISD::SETLE: SSECC = 2; break;
17686 std::swap(Op0, Op1);
17688 // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
17689 if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
17690 SDValue A = peekThroughBitcasts(Op0);
17691 if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
17692 ISD::isBuildVectorAllZeros(Op1.getNode())) {
17693 MVT VT0 = Op0.getSimpleValueType();
17694 SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
17695 SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
// EQ against zero becomes TESTNM ("and is all-zero"); NE becomes TESTM.
17696 return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
// Dedicated-opcode path (PCMPEQM/PCMPGTM) — guard condition elided above.
17702 return DAG.getNode(Opc, dl, VT, Op0, Op1);
// Otherwise emit the immediate-predicate compare, unsigned variant if
// required.
17703 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
17704 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17705 DAG.getConstant(SSECC, dl, MVT::i8));
17708 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17709 /// operand \p Op1. If non-trivial (for example because it's not constant)
17710 /// return an empty value.
// Works because x <u C is equivalent to x <=u C-1 for constant C; each
// lane's constant is decremented by one.
// NOTE(review): this numbered listing elides the guard returns — the
// `!BV` bail-out after 17713, the non-constant-element bail-out taken by
// the `if` at 17724, and (per the "Avoid underflow" comment) a bail-out
// for zero-valued elements before 17732.
17711 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17712 SelectionDAG &DAG) {
17713 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17717 MVT VT = Op1.getSimpleValueType();
17718 MVT EVT = VT.getVectorElementType();
17719 unsigned n = VT.getVectorNumElements();
17720 SmallVector<SDValue, 8> ULTOp1;
17722 for (unsigned i = 0; i < n; ++i) {
17723 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17724 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17727 // Avoid underflow.
17728 APInt Val = Elt->getAPIntValue();
17732 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17735 return DAG.getBuildVector(VT, dl, ULTOp1);
// Lower a vector SETCC. Handles, in order: floating-point compares
// (CMPP/CMPM with translateX86FSETCC, incl. the two-compare UEQ/ONE
// fallback), AVX-512 i1-mask results, XOP PCOM, and the generic SSE
// integer path (PCMPGT/PCMPEQ plus swap / invert / flip-sign / min-max /
// psubus transformations, and SSE2-only v2i64 emulation).
// NOTE(review): this numbered listing elides many lines (blank lines,
// closing braces, and several declarations/guards such as the `dl`
// (SDLoc), `Cmp`, `Opc`, `CC0`/`CC1` locals, the `if (isFP)` header, and
// some returns); the embedded numbering shows where each gap is.
17738 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17739 SelectionDAG &DAG) {
17740 SDValue Op0 = Op.getOperand(0);
17741 SDValue Op1 = Op.getOperand(1);
17742 SDValue CC = Op.getOperand(2);
17743 MVT VT = Op.getSimpleValueType();
17744 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
17745 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
// --- Floating-point vector compare path (guarded by `isFP`, elided) ---
17750 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17751 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17755 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17756 assert(VT.getVectorNumElements() <= 16);
17757 Opc = X86ISD::CMPM;
17759 Opc = X86ISD::CMPP;
17760 // The SSE/AVX packed FP comparison nodes are defined with a
17761 // floating-point vector result that matches the operand type. This allows
17762 // them to work with an SSE1 target (integer vector types are not legal).
17763 VT = Op0.getSimpleValueType();
17766 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17767 // emit two comparisons and a logic op to tie them together.
17769 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
17770 if (SSECC >= 8 && !Subtarget.hasAVX()) {
17771 // LLVM predicate is SETUEQ or SETONE.
// (The CC0/CC1 immediates for the two component compares are assigned in
// lines elided from this listing.)
17773 unsigned CombineOpc;
17774 if (Cond == ISD::SETUEQ) {
17777 CombineOpc = X86ISD::FOR;
17779 assert(Cond == ISD::SETONE);
17782 CombineOpc = X86ISD::FAND;
17785 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17786 DAG.getConstant(CC0, dl, MVT::i8));
17787 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17788 DAG.getConstant(CC1, dl, MVT::i8));
17789 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17791 // Handle all other FP comparisons here.
17792 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17793 DAG.getConstant(SSECC, dl, MVT::i8));
17796 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17797 // result type of SETCC. The bitcast is expected to be optimized away
17798 // during combining/isel.
17799 if (Opc == X86ISD::CMPP)
17800 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
// --- Integer vector compare path ---
17805 MVT VTOp0 = Op0.getSimpleValueType();
17806 assert(VTOp0 == Op1.getSimpleValueType() &&
17807 "Expected operands with same type!");
17808 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17809 "Invalid number of packed elements for source and destination!");
17811 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17812 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17813 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17814 // legalizer firstly checks if the first operand in input to the setcc has
17815 // a legal type. If so, then it promotes the return type to that same type.
17816 // Otherwise, the return type is promoted to the 'next legal type' which,
17817 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17819 // We reach this code only if the following two conditions are met:
17820 // 1. Both return type and operand type have been promoted to wider types
17821 // by the type legalizer.
17822 // 2. The original operand type has been promoted to a 256-bit vector.
17824 // Note that condition 2. only applies for AVX targets.
17825 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
17826 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17829 // The non-AVX512 code below works under the assumption that source and
17830 // destination types are the same.
17831 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17832 "Value types for source and destination must be the same!");
17834 // Break 256-bit integer vector compare into smaller ones.
17835 if (VT.is256BitVector() && !Subtarget.hasInt256())
17836 return Lower256IntVSETCC(Op, DAG);
17838 // Operands are boolean (vectors of i1)
17839 MVT OpVT = Op1.getSimpleValueType();
17840 if (OpVT.getVectorElementType() == MVT::i1)
17841 return LowerBoolVSETCC_AVX512(Op, DAG);
17843 // The result is boolean, but operands are int/float
17844 if (VT.getVectorElementType() == MVT::i1) {
17845 // In AVX-512 architecture setcc returns mask with i1 elements,
17846 // But there is no compare instruction for i8 and i16 elements in KNL.
17847 // In this case use SSE compare
17848 bool UseAVX512Inst =
17849 (OpVT.is512BitVector() ||
17850 OpVT.getScalarSizeInBits() >= 32 ||
17851 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17854 return LowerIntVSETCC_AVX512(Op, DAG);
// Fallback: do the compare in the wide type, then truncate to the mask.
17856 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17857 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17860 // Lower using XOP integer comparisons.
17861 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17862 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17863 // Translate compare code to XOP PCOM compare mode.
17864 unsigned CmpMode = 0;
17866 default: llvm_unreachable("Unexpected SETCC condition");
17868 case ISD::SETLT: CmpMode = 0x00; break;
17870 case ISD::SETLE: CmpMode = 0x01; break;
17872 case ISD::SETGT: CmpMode = 0x02; break;
17874 case ISD::SETGE: CmpMode = 0x03; break;
17875 case ISD::SETEQ: CmpMode = 0x04; break;
17876 case ISD::SETNE: CmpMode = 0x05; break;
17879 // Are we comparing unsigned or signed integers?
17881 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
17883 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17884 DAG.getConstant(CmpMode, dl, MVT::i8));
17887 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
17888 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
17889 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
17890 SDValue BC0 = peekThroughBitcasts(Op0);
17891 if (BC0.getOpcode() == ISD::AND) {
17893 SmallVector<APInt, 64> EltBits;
17894 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
17895 VT.getScalarSizeInBits(), UndefElts,
17896 EltBits, false, false)) {
// Only valid when every lane's mask is a power of two (single bit set).
17897 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
17899 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
17905 // We are handling one of the integer comparisons here. Since SSE only has
17906 // GT and EQ comparisons for integer, swapping operands and multiple
17907 // operations may be required for some comparisons.
17908 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
17910 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
17911 Cond == ISD::SETGE || Cond == ISD::SETUGE;
17912 bool Invert = Cond == ISD::SETNE ||
17913 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
17915 // If both operands are known non-negative, then an unsigned compare is the
17916 // same as a signed compare and there's no need to flip signbits.
17917 // TODO: We could check for more general simplifications here since we're
17918 // computing known bits.
17919 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
17920 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
17922 // Special case: Use min/max operations for SETULE/SETUGE
17923 MVT VET = VT.getVectorElementType();
17925 (Subtarget.hasAVX512() && VET == MVT::i64) ||
17926 (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
17927 (Subtarget.hasSSE2() && (VET == MVT::i8));
17928 bool MinMax = false;
17932 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17933 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
// min/max formulation needs none of the fix-ups below.
17937 Swap = Invert = FlipSigns = false;
17940 bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17941 bool Subus = false;
17942 if (!MinMax && HasSubus) {
17943 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17945 // t = psubus Op0, Op1
17946 // pcmpeq t, <0..0>
17949 case ISD::SETULT: {
17950 // If the comparison is against a constant we can turn this into a
17951 // setule. With psubus, setule does not require a swap. This is
17952 // beneficial because the constant in the register is no longer
17953 // destructed as the destination so it can be hoisted out of a loop.
17954 // Only do this pre-AVX since vpcmp* is no longer destructive.
17955 if (Subtarget.hasAVX())
17957 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17959 Subus = true; Invert = false; Swap = false;
17963 // Psubus is better than flip-sign because it requires no inversion.
17964 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17965 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17969 Opc = X86ISD::SUBUS;
17975 std::swap(Op0, Op1);
17977 // Check that the operation in question is available (most are plain SSE2,
17978 // but PCMPGTQ and PCMPEQQ have different requirements).
17979 if (VT == MVT::v2i64) {
// SSE2-only v2i64 signed-greater-than: emulate PCMPGTQ with 32-bit ops.
17980 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17981 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17983 // First cast everything to the right type.
17984 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17985 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17987 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17988 // bits of the inputs before performing those operations. The lower
17989 // compare is always unsigned.
17992 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17994 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17995 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17996 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17998 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17999 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
18001 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
18002 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
18003 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
18005 // Create masks for only the low parts/high parts of the 64 bit integers.
18006 static const int MaskHi[] = { 1, 1, 3, 3 };
18007 static const int MaskLo[] = { 0, 0, 2, 2 };
18008 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
18009 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
18010 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
18012 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
18013 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
18016 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18018 return DAG.getBitcast(VT, Result);
18021 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
18022 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
18023 // pcmpeqd + pshufd + pand.
18024 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
18026 // First cast everything to the right type.
18027 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18028 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18031 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
18033 // Make sure the lower and upper halves are both all-ones.
18034 static const int Mask[] = { 1, 0, 3, 2 };
18035 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
18036 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
18039 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18041 return DAG.getBitcast(VT, Result);
18045 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18046 // bits of the inputs before performing those operations.
18048 MVT EltVT = VT.getVectorElementType();
18049 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
18051 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
18052 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
18055 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18057 // If the logical-not of the result is required, perform that now.
18059 Result = DAG.getNOT(dl, Result, VT);
// min/max path: equality with the chosen operand implements <=u / >=u.
18062 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
// psubus path: the subtraction saturating to zero implements <=u.
18065 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
18066 getZeroVector(VT, Subtarget, DAG, dl));
18071 // Try to select this as a KTEST+SETCC if possible.
// Matches (bitcast vXi1 -> iN) ==/!= 0 and emits KTEST on the mask
// register directly, avoiding a move to a GPR.
// NOTE(review): this numbered listing elides the `return SDValue();`
// bail-outs after each guard below, and the else-branch of the
// isNullConstant check (non-zero Op1 is rejected in an elided line).
18072 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
18073 const SDLoc &dl, SelectionDAG &DAG,
18074 const X86Subtarget &Subtarget) {
18075 // Only support equality comparisons.
18076 if (CC != ISD::SETEQ && CC != ISD::SETNE)
18079 // Must be a bitcast from vXi1.
18080 if (Op0.getOpcode() != ISD::BITCAST)
18083 Op0 = Op0.getOperand(0);
18084 MVT VT = Op0.getSimpleValueType();
// KTESTB/KTESTW need DQI; KTESTD/KTESTQ need BWI.
18085 if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) &&
18086 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
18089 X86::CondCode X86CC;
18090 if (isNullConstant(Op1)) {
18091 X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
// KTEST k, k sets ZF iff the mask is all-zero.
18095 SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0);
18096 return getSETCC(X86CC, KTEST, dl, DAG);
// Lower a scalar (i8-result) or vector SETCC. Tries BT and KTEST pattern
// matches and SETCC-reuse first, then falls back to CMP + SETCC.
// NOTE(review): this numbered listing elides lines, including the `dl`
// (SDLoc) declaration, the `return NewSetCC;` bodies after 18117/18122,
// and the returns after the X86CC checks.
18099 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
18101 MVT VT = Op.getSimpleValueType();
18103 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
18105 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
18106 SDValue Op0 = Op.getOperand(0);
18107 SDValue Op1 = Op.getOperand(1);
18109 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18111 // Optimize to BT if possible.
18112 // Lower (X & (1 << N)) == 0 to BT(X, N).
18113 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
18114 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
18115 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
18116 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18117 if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
18121 // Try to lower using KTEST.
18122 if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
18125 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
18127 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
18128 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18130 // If the input is a setcc, then reuse the input setcc or use a new one with
18131 // the inverted condition.
18132 if (Op0.getOpcode() == X86ISD::SETCC) {
18133 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
// Invert when the predicate and the constant disagree (NE^(==0)).
18134 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
18138 CCode = X86::GetOppositeBranchCondition(CCode);
// Operand 1 of the existing SETCC is its EFLAGS input; reuse it.
18139 return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
// Generic path: translate the condition, emit CMP, convert for pre-CMOV
// FP targets, then materialize the flag with SETCC.
18143 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
18144 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
18145 if (X86CC == X86::COND_INVALID)
18148 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
18149 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
18150 return getSETCC(X86CC, EFLAGS, dl, DAG);
// Lower ISD::SETCCCARRY: compare LHS and RHS taking an incoming carry into
// account, via SBB.
// NOTE(review): this numbered listing elides a few lines, including the
// `DL` (SDLoc) declaration used below.
18153 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
18154 SDValue LHS = Op.getOperand(0);
18155 SDValue RHS = Op.getOperand(1);
18156 SDValue Carry = Op.getOperand(2);
18157 SDValue Cond = Op.getOperand(3);
18160 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
18161 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
18163 // Recreate the carry if needed.
// Adding all-ones sets CF iff the incoming carry value is non-zero,
// putting the carry back into EFLAGS (result #1 of the X86ISD::ADD).
18164 EVT CarryVT = Carry.getValueType();
18165 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
18166 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
18167 Carry, DAG.getConstant(NegOne, DL, CarryVT));
// SBB consumes the flag and produces the compare flags as result #1.
18169 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18170 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
18171 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
/// Return true if opcode is a X86 logical comparison, i.e. a node whose
/// EFLAGS result can be consumed directly by CMOV/SETCC/branch lowering.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getOpcode();
  // Dedicated compare nodes always produce usable flags.
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
  // Arithmetic/logic nodes expose their flags as result #1.
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
       Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
       Opc == X86ISD::XOR || Opc == X86ISD::AND))
  // X86ISD::UMUL exposes its flags as result #2.
  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
// Return true if V is a TRUNCATE whose input has all-zero bits above the
// truncated width, i.e. the truncate loses no information for this value.
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
  SDValue VOp0 = V.getOperand(0);
  unsigned InBits = VOp0.getValueSizeInBits();
  unsigned Bits = V.getValueSizeInBits();
  // Ask the DAG whether every bit dropped by the truncate is known zero.
  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
// Lower ISD::SELECT into X86ISD::CMOV (the general path) or into specialized
// SSE/AVX/AVX512 sequences for scalar-FP selects and vXi1 mask selects.
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool AddTest = true;              // True while EFLAGS still need a TEST.
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);   // Value when Cond is true.
  SDValue Op2 = Op.getOperand(2);   // Value when Cond is false.
  MVT VT = Op1.getSimpleValueType();

  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
  // are available or VBLENDV if AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
  if (Cond.getOpcode() == ISD::SETCC &&
      ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
       (Subtarget.hasSSE1() && VT == MVT::f32)) &&
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    unsigned SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    if (Subtarget.hasAVX512()) {
      // AVX512: compare into a v1i1 mask and select with X86ISD::SELECTS.
      SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
                                CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
      assert(!VT.isVector() && "Not a scalar type?");
      return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);

    // SSECC >= 8 encodes AVX-only compare predicates.
    if (SSECC < 8 || Subtarget.hasAVX()) {
      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                                DAG.getConstant(SSECC, DL, MVT::i8));

      // If we have AVX, we can use a variable vector select (VBLENDV) instead
      // of 3 logic instructions for size savings and potentially speed.
      // Unfortunately, there is no scalar form of VBLENDV.

      // If either operand is a constant, don't try this. We can expect to
      // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.

      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
      // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      if (Subtarget.hasAVX() &&
          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

        // Convert to vectors, do a VSELECT, and convert back to scalar.
        // All of the conversions should be optimized away.

        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
        VCmp = DAG.getBitcast(VCmpVT, VCmp);

        SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                           VSel, DAG.getIntPtrConstant(0, DL));

      // Scalar AND/ANDN/OR blend: (Cmp & Op1) | (~Cmp & Op2).
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);

  // AVX512 fallback is to lower selects of scalar floats to masked moves.
  if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);

  // For v64i1 without 64-bit support we need to split and rejoin.
  if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
    assert(Subtarget.hasBWI() && "Expected BWI to be legal");
    SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
    SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
    SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
    SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
    SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
    SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);

  // Mask-vector select: if both operands can be represented as scalar
  // integers (constant build_vectors or bitcasts), select the scalars.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
      Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
    else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
      Op1Scalar = Op1.getOperand(0);
    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
      Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
    else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
      Op2Scalar = Op2.getOperand(0);
    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
      SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
                                        Op1Scalar, Op2Scalar);
      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
        return DAG.getBitcast(VT, newSelect);
      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                         DAG.getIntPtrConstant(0, DL));

  // Widen narrow mask selects to v8i1, select, then extract back.
  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
    SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);

  if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {

      // If the condition was updated, it's possible that the operands of the
      // select were also updated (for example, EmitTest has a RAUW). Refresh
      // the local references to the select operands in case they got stale.
      Op1 = Op.getOperand(1);
      Op2 = Op.getOperand(2);

  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);

    unsigned CondCode =
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
      SDValue CmpOp0 = Cmp.getOperand(0);

      // Apply further optimizations for special cases
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (isNullConstant(Y) &&
          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
        SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
        SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
        // NEG sets CF iff x != 0; SETCC_CARRY then smears CF across the result.
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));

      // Compare x against 1 so that CF == (x == 0).
      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res =   // Res = 0 or -1.
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
               Cmp.getOperand(0).getOpcode() == ISD::AND &&
               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
      SDValue CmpOp0 = Cmp.getOperand(0);
      SDValue Src1, Src2;
      // true if Op2 is XOR or OR operator and one of its operands
      // ( a , a op b) || ( b , a op b)
      auto isOrXorPattern = [&]() {
        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);

      if (isOrXorPattern()) {

        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // we need mask of all zeros or ones with same size of the other
        if (CmpSz > VT.getSizeInBits())
          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
        else if (CmpSz < VT.getSizeInBits())
          Neg = DAG.getNode(ISD::AND, DL, VT,
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
              DAG.getConstant(1, DL, VT));

        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Neg); // -(and (x, 0x1))
        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getSimpleValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME

  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    // Overflow-checking arithmetic: re-emit as the flag-producing X86 node
    // and select on the corresponding overflow/carry condition.
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;

    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");

    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    // UMUL carries its flags in result #2, the other nodes in result #1.
    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, DL, MVT::i8);

  // Look past the truncate if the high bits are known zero.
  if (isTruncWithZeroHighBitsInput(Cond, DAG))
    Cond = Cond.getOperand(0);

  // We know the result of AND is compared against zero. Try to match
  if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
    if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
      CC = NewSetCC.getOperand(0);
      Cond = NewSetCC.getOperand(1);

    // Materialize EFLAGS with a TEST against zero and select on NE.
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);

  // a < b ? -1 : 0 -> RES = ~setcc_carry
  // a < b ? 0 : -1 -> RES = setcc_carry
  // a >= b ? -1 : 0 -> RES = setcc_carry
  // a >= b ? 0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (isNullConstant(Op1) || isNullConstant(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),

      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());

  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
  // widen the cmov and push the truncate through. This avoids introducing a new
  // branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
// Sign-extend a vXi1 mask vector to a full integer vector, using a native
// mask extend (VSEXT) when DQI/BWI cover the element width, otherwise a
// vselect between all-ones and zero; widens to 512 bits when VLX is missing.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  MVT VTElt = VT.getVectorElementType();

  unsigned NumElts = VT.getVectorNumElements();

  // Extend VT if the scalar type is v8/v16 and BWI is not supported.
  if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);

  // Widen to 512-bits if VLX is not supported.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    NumElts *= 512 / ExtVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, NumElts);
    // Place the original mask in the low lanes of a wider undef mask.
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
                     In, DAG.getIntPtrConstant(0, dl));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);

  MVT WideEltVT = WideVT.getVectorElementType();
  // DQI handles >= 32-bit elements natively; BWI handles <= 16-bit elements.
  if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
      (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
    V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
    // No native extend: select -1/0 per mask lane instead.
    SDValue NegOne = getOnesVector(WideVT, DAG, dl);
    SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
    V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);

  // Truncate if we had to extend i16/i8 above.
    WideVT = MVT::getVectorVT(VTElt, NumElts);
    V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);

  // Extract back to 128/256-bit if we widened.
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
                    DAG.getIntPtrConstant(0, dl));
// Lower ISD::ANY_EXTEND of vectors: i1 sources go through the mask
// sign-extend path; AVX targets try the generic AVX extend lowering.
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();

  // Mask vectors have no cheaper any-extend than a full sign-extend.
  if (InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SDValue In = Op->getOperand(0);
  MVT VT = Op->getSimpleValueType(0);
  MVT InVT = In.getSimpleValueType();
  assert(VT.getSizeInBits() == InVT.getSizeInBits());

  MVT SVT = VT.getVectorElementType();
  MVT InSVT = InVT.getVectorElementType();
  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());

  // Bail out for element-type / subtarget combinations we don't handle here.
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
      !(VT.is512BitVector() && Subtarget.hasAVX512()))

  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
  // For 512-bit vectors, we need 128-bits or 256-bits.
  if (VT.getSizeInBits() > 128) {
    // Input needs to be at least the same number of elements as output, and
    // at least 128-bits.
    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));

  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
          InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

  // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
  // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
  // need to be handled here for 256/512-bit results.
  if (Subtarget.hasInt256()) {
    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
    unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
                      X86ISD::VSEXT : X86ISD::VZEXT;
    return DAG.getNode(ExtOpc, dl, VT, In);

  // We should only get here for sign extend.
  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
         "Unexpected opcode!");

  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.

  // As SRAI is only available on i16/i32 types, we expand only up to i32
  // and handle i64 separately.
  // Each UNPCKL with undef doubles the element width and halves the count.
  while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
    MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
    CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
    Curr = DAG.getBitcast(CurrVT, Curr);

  SDValue SignExt = Curr;
  if (CurrVT != InVT) {
    // Arithmetic shift replicates the original sign bit across the widened
    // portion of each lane.
    unsigned SignExtShift =
        CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
    SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                          DAG.getConstant(SignExtShift, dl, MVT::i8));

  // For v2i64, interleave the i32 results with their sign words (SRAI by 31).
  if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
    SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                               DAG.getConstant(31, dl, MVT::i8));
    SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
    return DAG.getBitcast(VT, Ext);
// Lower ISD::SIGN_EXTEND of vectors: a single VSEXT on AVX2+, otherwise split
// the input into two halves, sign_extend_vector_inreg each, and concatenate.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();

  if (InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

  // Only the (result, source) pairs listed below are custom-lowered here.
  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
      (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
      (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
      (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
      (VT != MVT::v32i16 || InVT != MVT::v32i8))

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  // Optimize vectors in AVX mode
  // Sign extend v8i16 to v8i32 and

  // Divide input vector into two parts
  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
  // concat the vectors to original VT

  unsigned NumElems = InVT.getVectorNumElements();
  SDValue Undef = DAG.getUNDEF(InVT);

  // Low half: elements [0, NumElems/2), rest undef.
  SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)

  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);

  // High half: elements [NumElems/2, NumElems), rest undef.
  SmallVector<int,8> ShufMask2(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask2[i] = i + NumElems/2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);

  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);

  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
// Lower truncating store. We need a special lowering to vXi1 vectors
static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());

  EVT MemVT = St->getMemoryVT();
  assert(St->isTruncatingStore() && "We only custom truncating store.");
  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
         "Expected truncstore of i1 vector");

  SDValue Op = St->getValue();
  MVT OpVT = Op.getValueType().getSimpleVT();
  unsigned NumElts = OpVT.getVectorNumElements();
  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
    // Truncate and store - everything is legal
    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
    if (MemVT.getSizeInBits() < 8)
      // Widen sub-byte masks to v8i1 so the store is at least one byte.
      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
                       DAG.getUNDEF(MVT::v8i1), Op,
                       DAG.getIntPtrConstant(0, dl));
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
                        St->getMemOperand());

  // A subset, assume that we have only AVX-512F
  if (NumElts <= 8) {

    // Extend to 8-elts vector
    MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
                     DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));

    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
    // Store the v8i1 mask as a single byte.
    Op = DAG.getBitcast(MVT::i8, Op);
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
                        St->getMemOperand());

  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
  // Divide the vector into 2 parts and store each part separately
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
                           DAG.getIntPtrConstant(0, dl));
  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
  SDValue BasePtr = St->getBasePtr();
  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
                              St->getMemOperand());
  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
                           DAG.getIntPtrConstant(16, dl));
  Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);

  // The high 16 mask bits live 2 bytes past the base pointer.
  SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);

  SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
                              BasePtrHi, St->getPointerInfo().getWithOffset(2),
                              MinAlign(St->getAlignment(), 2U),
                              St->getMemOperand()->getFlags());
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
// Lower an extending load of a vXi1 vector: load the mask (as a mask register
// or as a scalar byte / split halves), then sign- or zero-extend it to the
// requested result type.
static SDValue LowerExtended1BitVectorLoad(SDValue Op,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {

  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());

  EVT MemVT = Ld->getMemoryVT();
  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
         "Expected i1 vector load");
  // EXTLOAD is handled the same as SEXTLOAD here.
  unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
    ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
  MVT VT = Op.getValueType().getSimpleVT();
  unsigned NumElts = VT.getVectorNumElements();

  if ((Subtarget.hasBWI() && NumElts >= 32) ||
      (Subtarget.hasDQI() && NumElts < 16) ||
    // Load and extend - everything is legal
      // Narrow masks are loaded as a legal v8i1 and extracted afterwards.
      SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
                                 Ld->getMemOperand());
      // Replace chain users with the new chain.
      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
      if (Subtarget.hasVLX()) {
        // Extract to v4i1/v2i1.
        SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Load,
                                      DAG.getIntPtrConstant(0, dl));
        // Finally, do a normal sign-extend to the desired register.
        return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);

      // Without VLX: extend to 8 elements first, then take the subvector.
      MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
      SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);

      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
    SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
                               Ld->getMemOperand());
    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    // Finally, do a normal sign-extend to the desired register.
    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);

  if (NumElts <= 8) {
    // A subset, assume that we have only AVX-512F
    SDValue Load = DAG.getLoad(MVT::i8, dl, Ld->getChain(),
                               Ld->getMemOperand());
    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    // Reinterpret the loaded byte as a v8i1 mask.
    SDValue BitVec = DAG.getBitcast(MVT::v8i1, Load);

      return DAG.getNode(ExtOpcode, dl, VT, BitVec);

    if (Subtarget.hasVLX()) {
      // Extract to v4i1/v2i1.
      SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, BitVec,
                                    DAG.getIntPtrConstant(0, dl));
      // Finally, do a normal sign-extend to the desired register.
      return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);

    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
    SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));

  assert(VT == MVT::v32i8 && "Unexpected extload type");

  // v32i1 source: load two v16i1 halves, extend each to v16i8, and concat.
  SDValue BasePtr = Ld->getBasePtr();
  SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
                               Ld->getMemOperand());

  // High half starts 2 bytes (16 mask bits) after the base pointer.
  SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);

  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi,
                               Ld->getPointerInfo().getWithOffset(2),
                               MinAlign(Ld->getAlignment(), 2U),
                               Ld->getMemOperand()->getFlags());

  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                                 LoadLo.getValue(1), LoadHi.getValue(1));
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);

  SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
  SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18914 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18915 // may emit an illegal shuffle but the expansion is still better than scalar
18916 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18917 // we'll emit a shuffle and a arithmetic shift.
18918 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18919 // TODO: It is possible to support ZExt by zeroing the undef values during
18920 // the shuffle phase or after the shuffle.
18921 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18922 SelectionDAG &DAG) {
18923 MVT RegVT = Op.getSimpleValueType();
18924 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18925 assert(RegVT.isInteger() &&
18926 "We only custom lower integer vector sext loads.");
18928 // Nothing useful we can do without SSE2 shuffles.
18929 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18931 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18933 EVT MemVT = Ld->getMemoryVT();
18934 if (MemVT.getScalarType() == MVT::i1)
18935 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18937 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18938 unsigned RegSz = RegVT.getSizeInBits();
18940 ISD::LoadExtType Ext = Ld->getExtensionType();
18942 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18943 && "Only anyext and sext are currently implemented.");
18944 assert(MemVT != RegVT && "Cannot extend to the same type");
18945 assert(MemVT.isVector() && "Must load a vector from memory");
18947 unsigned NumElems = RegVT.getVectorNumElements();
18948 unsigned MemSz = MemVT.getSizeInBits();
18949 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18951 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18952 // The only way in which we have a legal 256-bit vector result but not the
18953 // integer 256-bit operations needed to directly lower a sextload is if we
18954 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18955 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18956 // correctly legalized. We do this late to allow the canonical form of
18957 // sextload to persist throughout the rest of the DAG combiner -- it wants
18958 // to fold together any extensions it can, and so will fuse a sign_extend
18959 // of an sextload into a sextload targeting a wider value.
18961 if (MemSz == 128) {
18962 // Just switch this to a normal load.
18963 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18964 "it must be a legal 128-bit vector "
18966 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18967 Ld->getPointerInfo(), Ld->getAlignment(),
18968 Ld->getMemOperand()->getFlags());
18970 assert(MemSz < 128 &&
18971 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18972 // Do an sext load to a 128-bit vector type. We want to use the same
18973 // number of elements, but elements half as wide. This will end up being
18974 // recursively lowered by this routine, but will succeed as we definitely
18975 // have all the necessary features if we're using AVX1.
18977 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18978 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18980 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18981 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18982 Ld->getMemOperand()->getFlags());
18985 // Replace chain users with the new chain.
18986 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18987 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18989 // Finally, do a normal sign-extend to the desired register.
18990 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18993 // All sizes must be a power of two.
18994 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18995 "Non-power-of-two elements are not custom lowered!");
18997 // Attempt to load the original value using scalar loads.
18998 // Find the largest scalar type that divides the total loaded size.
18999 MVT SclrLoadTy = MVT::i8;
19000 for (MVT Tp : MVT::integer_valuetypes()) {
19001 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
19006 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
19007 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
19009 SclrLoadTy = MVT::f64;
19011 // Calculate the number of scalar loads that we need to perform
19012 // in order to load our vector from memory.
19013 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
19015 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
19016 "Can only lower sext loads with a single scalar load!");
19018 unsigned loadRegZize = RegSz;
19019 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
19022 // If we don't have BWI we won't be able to create the shuffle needed for
19024 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19025 MemVT == MVT::v8i8)
19028 // Represent our vector as a sequence of elements which are the
19029 // largest scalar that we can load.
19030 EVT LoadUnitVecVT = EVT::getVectorVT(
19031 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
19033 // Represent the data using the same element type that is stored in
19034 // memory. In practice, we ''widen'' MemVT.
19036 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
19037 loadRegZize / MemVT.getScalarSizeInBits());
19039 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
19040 "Invalid vector type");
19042 // We can't shuffle using an illegal type.
19043 assert(TLI.isTypeLegal(WideVecVT) &&
19044 "We only lower types that form legal widened vector types");
19046 SmallVector<SDValue, 8> Chains;
19047 SDValue Ptr = Ld->getBasePtr();
19048 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
19049 TLI.getPointerTy(DAG.getDataLayout()));
19050 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
19052 for (unsigned i = 0; i < NumLoads; ++i) {
19053 // Perform a single load.
19054 SDValue ScalarLoad =
19055 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
19056 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
19057 Chains.push_back(ScalarLoad.getValue(1));
19058 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
19059 // another round of DAGCombining.
19061 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
19063 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
19064 ScalarLoad, DAG.getIntPtrConstant(i, dl));
19066 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
19069 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
19071 // Bitcast the loaded value to a vector of the original element type, in
19072 // the size of the target vector type.
19073 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
19074 unsigned SizeRatio = RegSz / MemSz;
19076 if (Ext == ISD::SEXTLOAD) {
19077 // If we have SSE4.1, we can directly emit a VSEXT node.
19078 if (Subtarget.hasSSE41()) {
19079 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
19080 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19084 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
19086 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
19087 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
19089 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
19090 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19094 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19095 MemVT == MVT::v8i8) {
19096 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
19097 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19101 // Redistribute the loaded elements into the different locations.
19102 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
19103 for (unsigned i = 0; i != NumElems; ++i)
19104 ShuffleVec[i * SizeRatio] = i;
19106 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
19107 DAG.getUNDEF(WideVecVT), ShuffleVec);
19109 // Bitcast to the requested type.
19110 Shuff = DAG.getBitcast(RegVT, Shuff);
19111 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19115 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
19116 /// each of which has no other use apart from the AND / OR.
19117 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
19118 Opc = Op.getOpcode();
19119 if (Opc != ISD::OR && Opc != ISD::AND)
19121 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19122 Op.getOperand(0).hasOneUse() &&
19123 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
19124 Op.getOperand(1).hasOneUse());
19127 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
19128 /// SETCC node has a single use.
19129 static bool isXor1OfSetCC(SDValue Op) {
19130 if (Op.getOpcode() != ISD::XOR)
19132 if (isOneConstant(Op.getOperand(1)))
19133 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19134 Op.getOperand(0).hasOneUse();
/// Lower an ISD::BRCOND node into X86ISD::BRCOND, trying to fold the i1
/// condition into an EFLAGS-producing node (CMP/TEST/overflow arithmetic)
/// instead of materializing and re-testing a boolean. Recognizes AND/OR of
/// two setccs and ordered/unordered FP compares, which are emitted as two
/// branches.
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  bool Inverted = false;

  if (Cond.getOpcode() == ISD::SETCC) {
    // Check for setcc([su]{add,sub,mul}o == 0).
    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
        isNullConstant(Cond.getOperand(1)) &&
        Cond.getOperand(0).getResNo() == 1 &&
        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      // Branch directly on the (inverted) overflow result of the arith node.
      Cond = Cond.getOperand(0);
    if (SDValue NewCond = LowerSETCC(Cond, DAG))
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);

  // Look pass (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getOperand(1);

  CondOpcode = Cond.getOpcode();
  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
       Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;

    // Keep this in sync with LowerXALUO, otherwise we might create redundant
    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
      // add/sub of constant 1 can use INC/DEC with the overflow flag.
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
    // UMUL carries an extra i8 result, so its flag output is value #2.
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          // Emit the first branch, then fall through to a second BRCOND on
          // the other setcc's condition (both share the same compare).
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, dl, MVT::i8);
          SDNode *User = *Op.getNode()->use_begin();
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User->getOpcode() == ISD::BR) {
            SDValue FalseBB = User->getOperand(1);
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User);

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, dl, MVT::i8);
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
      // It should be transformed during dag combiner except when the condition
      // is set by a arithmetics with overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, dl, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
      // For FCMP_OEQ, we can emit
      // two branches instead of an explicit AND instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_OEQ.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          // Second branch tests the parity flag for the unordered case.
          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
      // For FCMP_UNE, we can emit
      // two branches instead of an explicit AND instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_UNE.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          // Second branch tests "not parity" for the ordered case.
          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);

  // Look pass the truncate if the high bits are known zero.
  if (isTruncWithZeroHighBitsInput(Cond, DAG))
    Cond = Cond.getOperand(0);

  // We know the result of AND is compared against zero. Try to match
  if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
    if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
      CC = NewSetCC.getOperand(0);
      Cond = NewSetCC.getOperand(1);

    // No flags-producing pattern matched: emit an explicit TEST of Cond.
    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    Cond = EmitTest(Cond, X86Cond, dl, DAG);

  Cond = ConvertCmpIfNecessary(Cond, DAG);
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool SplitStack = MF.shouldSplitStack();
  bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
  // Only non-MachO Windows, split-stack functions, and stack-probing
  // functions need the special lowering; otherwise adjust SP inline.
  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack || EmitStackProbe;

  SDNode *Node = Op.getNode();
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  EVT VT = Node->getValueType(0);

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

  bool Is64Bit = Subtarget.is64Bit();
  MVT SPTy = getPointerTy(DAG.getDataLayout());

    // Inline path: SP -= Size, optionally realigned, then copied back.
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
                    " not tell us which reg is the stack pointer!");

    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
    Chain = SP.getValue(1);
    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
    unsigned StackAlign = TFI.getStackAlignment();
    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
    if (Align > StackAlign)
      // Round the new SP down to the requested over-alignment.
      Result = DAG.getNode(ISD::AND, dl, VT, Result,
                           DAG.getConstant(-(uint64_t)Align, dl, VT));
    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
  } else if (SplitStack) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

      // The 64 bit implementation of segmented stacks needs to clobber both r10
      // r11. This makes it impossible to use it along with nested parameters.
      const Function &F = MF.getFunction();
      for (const auto &A : F.args()) {
        if (A.hasNestAttr())
          report_fatal_error("Cannot use segmented stacks with functions that "
                             "have nested arguments.");

    // SEG_ALLOCA performs the segmented-stack allocation of Size bytes.
    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                         DAG.getRegister(Vreg, SPTy));
    // Windows path: WIN_ALLOCA probes the stack, then the adjusted SP is
    // read back (and realigned if needed) as the result.
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned SPReg = RegInfo->getStackRegister();
    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
    Chain = SP.getValue(1);

      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, dl, VT));
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

  SDValue Ops[2] = {Result, Chain};
  return DAG.getMergeValues(Ops, dl);
/// Lower ISD::VASTART. On Win64 / 32-bit targets the va_list is a single
/// pointer; on 64-bit SysV it is the four-field struct
/// { i32 gp_offset, i32 fp_offset, i8* overflow_arg_area, i8* reg_save_area }
/// and each field is initialized with a separate store.
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  if (!Subtarget.is64Bit() ||
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));

  // gp_offset (0 - 6 * 8)
  // fp_offset (48 - 48 + 8 * 16)
  // overflow_arg_area (point to parameters coming in memory).
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset at offset 0.
  SDValue Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV));
  MemOps.push_back(Store);

  // Store fp_offset at offset 4.
  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
  Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV, 4));
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
  Store = DAG.getStore(
      Op.getOperand(0), DL, RSFIN, FIN,
      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
  MemOps.push_back(Store);
  // Tie all the field stores together into a single chain result.
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
/// Lower ISD::VAARG for 64-bit SysV. Classifies the argument as coming from
/// the GP or FP register-save area, emits a VAARG_64 pseudo that computes the
/// argument address (updating the va_list in memory), and loads the value.
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert(Op.getNumOperands() == 4);

  MachineFunction &MF = DAG.getMachineFunction();
  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
    // The Win64 ABI uses char* instead of a structure.
    return DAG.expandVAArg(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);

  EVT ArgVT = Op.getNode()->getValueType(0);
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety. This simple
  // selection mechanism works only for the basic types.
  if (ArgVT == MVT::f80) {
    llvm_unreachable("va_arg for f80 not yet implemented");
  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
    llvm_unreachable("Unhandled argument type in LowerVAARG");

  if (ArgMode == 2) {
    // Sanity Check: Make sure using fp_offset makes sense.
    assert(!Subtarget.useSoftFloat() &&
           !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
           Subtarget.hasSSE1());

  // Insert VAARG_64 node into the DAG
  // VAARG_64 returns two values: Variable Argument Address, Chain
  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
                       DAG.getConstant(ArgMode, dl, MVT::i8),
                       DAG.getConstant(Align, dl, MVT::i32)};
  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
  // The pseudo both reads and writes the va_list, hence MOLoad | MOStore.
  SDValue VAARG = DAG.getMemIntrinsicNode(
      X86ISD::VAARG_64, dl,
      VTs, InstOps, MVT::i64,
      MachinePointerInfo(SV),
      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
  Chain = VAARG.getValue(1);

  // Load the next argument and return it
  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19615 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19616 SelectionDAG &DAG) {
19617 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19618 // where a va_list is still an i8*.
19619 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19620 if (Subtarget.isCallingConvWin64(
19621 DAG.getMachineFunction().getFunction().getCallingConv()))
19622 // Probably a Win64 va_copy.
19623 return DAG.expandVACopy(Op.getNode());
19625 SDValue Chain = Op.getOperand(0);
19626 SDValue DstPtr = Op.getOperand(1);
19627 SDValue SrcPtr = Op.getOperand(2);
19628 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19629 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19632 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19633 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19635 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19638 /// Handle vector element shifts where the shift amount is a constant.
19639 /// Takes immediate version of shift as input.
19640 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19641 SDValue SrcOp, uint64_t ShiftAmt,
19642 SelectionDAG &DAG) {
19643 MVT ElementType = VT.getVectorElementType();
19645 // Bitcast the source vector to the output type, this is mainly necessary for
19646 // vXi8/vXi64 shifts.
19647 if (VT != SrcOp.getSimpleValueType())
19648 SrcOp = DAG.getBitcast(VT, SrcOp);
19650 // Fold this packed shift into its first operand if ShiftAmt is 0.
19654 // Check for ShiftAmt >= element width
19655 if (ShiftAmt >= ElementType.getSizeInBits()) {
19656 if (Opc == X86ISD::VSRAI)
19657 ShiftAmt = ElementType.getSizeInBits() - 1;
19659 return DAG.getConstant(0, dl, VT);
19662 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19663 && "Unknown target vector shift-by-constant node");
19665 // Fold this packed vector shift into a build vector if SrcOp is a
19666 // vector of Constants or UNDEFs.
19667 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19668 SmallVector<SDValue, 8> Elts;
19669 unsigned NumElts = SrcOp->getNumOperands();
19670 ConstantSDNode *ND;
19673 default: llvm_unreachable("Unknown opcode!");
19674 case X86ISD::VSHLI:
19675 for (unsigned i=0; i!=NumElts; ++i) {
19676 SDValue CurrentOp = SrcOp->getOperand(i);
19677 if (CurrentOp->isUndef()) {
19678 Elts.push_back(CurrentOp);
19681 ND = cast<ConstantSDNode>(CurrentOp);
19682 const APInt &C = ND->getAPIntValue();
19683 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19686 case X86ISD::VSRLI:
19687 for (unsigned i=0; i!=NumElts; ++i) {
19688 SDValue CurrentOp = SrcOp->getOperand(i);
19689 if (CurrentOp->isUndef()) {
19690 Elts.push_back(CurrentOp);
19693 ND = cast<ConstantSDNode>(CurrentOp);
19694 const APInt &C = ND->getAPIntValue();
19695 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19698 case X86ISD::VSRAI:
19699 for (unsigned i=0; i!=NumElts; ++i) {
19700 SDValue CurrentOp = SrcOp->getOperand(i);
19701 if (CurrentOp->isUndef()) {
19702 Elts.push_back(CurrentOp);
19705 ND = cast<ConstantSDNode>(CurrentOp);
19706 const APInt &C = ND->getAPIntValue();
19707 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19712 return DAG.getBuildVector(VT, dl, Elts);
19715 return DAG.getNode(Opc, dl, VT, SrcOp,
19716 DAG.getConstant(ShiftAmt, dl, MVT::i8));
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                   SDValue SrcOp, SDValue ShAmt,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT SVT = ShAmt.getSimpleValueType();
  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

  // Catch shift-by-constant.
  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
                                      CShAmt->getZExtValue(), DAG);

  // Change opcode to non-immediate version
  default: llvm_unreachable("Unknown target vector shift node");
  case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
  case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
  case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;

  // Need to build a vector containing shift amount.
  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
  // +=================+============+=======================================+
  // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
  // +=================+============+=======================================+
  // | i64             | Yes, No    | Use ShAmt as lowest elt               |
  // | i32             | Yes        | zero-extend in-reg                    |
  // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
  // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
  // +=================+============+=======================================+
  if (SVT == MVT::i64)
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
           ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
    // Peek through a zext of i16 so the in-reg zero-extend starts from the
    // narrow value.
    ShAmt = ShAmt.getOperand(0);
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else if (Subtarget.hasSSE41() &&
             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
    // Without SSE4.1: explicitly zero the second i32 lane of the count.
    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
                        DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);

  // The return type has to be a 128-bit type with the same element
  // type as the input type.
  MVT EltVT = VT.getVectorElementType();
  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());

  ShAmt = DAG.getBitcast(ShVT, ShAmt);
  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19778 /// \brief Return Mask with the necessary casting or extending
19779 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19780 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19781 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19784 if (isAllOnesConstant(Mask))
19785 return DAG.getConstant(1, dl, MaskVT);
19786 if (X86::isZeroNode(Mask))
19787 return DAG.getConstant(0, dl, MaskVT);
19789 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19790 // Mask should be extended
19791 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19792 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19795 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19796 if (MaskVT == MVT::v64i1) {
19797 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19798 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
19800 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19801 DAG.getConstant(0, dl, MVT::i32));
19802 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19803 DAG.getConstant(1, dl, MVT::i32));
19805 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19806 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19808 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19810 // MaskVT require < 64bit. Truncate mask (should succeed in any case),
19812 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19813 return DAG.getBitcast(MaskVT,
19814 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19818 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19819 Mask.getSimpleValueType().getSizeInBits());
19820 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
19821 // are extracted by EXTRACT_SUBVECTOR.
19822 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19823 DAG.getBitcast(BitcastVT, Mask),
19824 DAG.getIntPtrConstant(0, dl));
19828 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19829 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19830 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19831 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19832 SDValue PreservedSrc,
19833 const X86Subtarget &Subtarget,
19834 SelectionDAG &DAG) {
19835 MVT VT = Op.getSimpleValueType();
// One i1 mask element per vector lane of the result.
19836 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19837 unsigned OpcodeSelect = ISD::VSELECT;
// An all-ones mask keeps every lane, so no masking node is needed.
// NOTE(review): this excerpt elides lines here (the early-out body and the
// SDLoc declaration used below) -- confirm against the full source.
19840 if (isAllOnesConstant(Mask))
19843 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19845 switch (Op.getOpcode()) {
// Compare-style nodes already produce an i1-per-lane result, so masking
// reduces to a plain AND with the mask vector.
19848 case X86ISD::CMPM_RND:
19849 case X86ISD::CMPMU:
19850 case X86ISD::VPSHUFBITQMB:
19851 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19852 case X86ISD::VFPCLASS:
19853 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19854 case X86ISD::VTRUNC:
19855 case X86ISD::VTRUNCS:
19856 case X86ISD::VTRUNCUS:
19857 case X86ISD::CVTPS2PH:
19858 // We can't use ISD::VSELECT here because it is not always "Legal"
19859 // for the destination type. For example vpmovqb require only AVX512
19860 // and vselect that can operate on byte element type require BWI
19861 OpcodeSelect = X86ISD::SELECT;
// An undef pass-through is canonicalized to zero so the masked-off lanes
// carry a defined value.
19864 if (PreservedSrc.isUndef())
19865 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19866 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19869 /// \brief Creates an SDNode for a predicated scalar operation.
19870 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19871 /// The mask is coming as MVT::i8 and it should be transformed
19872 /// to MVT::v1i1 while lowering masking intrinsics.
19873 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19874 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19875 /// for a scalar instruction.
19876 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19877 SDValue PreservedSrc,
19878 const X86Subtarget &Subtarget,
19879 SelectionDAG &DAG) {
// If the mask is a constant with bit 0 set, the single lane is always
// selected and the operation can be used unmasked.
// NOTE(review): the early-out's return statement is elided in this
// excerpt -- confirm against the full source.
19881 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19882 if (MaskConst->getZExtValue() & 0x1)
19885 MVT VT = Op.getSimpleValueType();
// Wrap the scalar i8 mask into a v1i1 vector as the masking nodes expect.
19888 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
// Scalar compare results are masked with AND; VFPCLASSS with OR.
19889 if (Op.getOpcode() == X86ISD::FSETCCM ||
19890 Op.getOpcode() == X86ISD::FSETCCM_RND)
19891 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19892 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19893 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
// An undef pass-through is canonicalized to zero so the masked-off lane
// carries a defined value.
19895 if (PreservedSrc.isUndef())
19896 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19897 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
/// Returns the size in bytes of the EH registration node that WinEHStatePass
/// creates for \p Fn, or reports a fatal error for unsupported personalities.
19900 static int getSEHRegistrationNodeSize(const Function *Fn) {
19901 if (!Fn->hasPersonalityFn())
19902 report_fatal_error(
19903 "querying registration node size for function without personality");
19904 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19905 // WinEHStatePass for the full struct definition.
19906 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19907 case EHPersonality::MSVC_X86SEH: return 24;
19908 case EHPersonality::MSVC_CXX: return 16;
// NOTE(review): the switch's default/closing lines are elided in this
// excerpt; any other personality falls through to the fatal error below.
19911 report_fatal_error(
19912 "can only recover FP for 32-bit MSVC EH personality functions");
19915 /// When the MSVC runtime transfers control to us, either to an outlined
19916 /// function or when returning to a parent frame after catching an exception, we
19917 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19918 /// Here's the math:
19919 ///   RegNodeBase = EntryEBP - RegNodeSize
19920 ///   ParentFP = RegNodeBase - ParentFrameOffset
19921 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19922 /// subtracting the offset (negative on x86) takes us back to the parent FP.
19923 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19924 SDValue EntryEBP) {
19925 MachineFunction &MF = DAG.getMachineFunction();
// NOTE(review): this excerpt elides the SDLoc declaration used below --
// confirm against the full source.
19928 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19929 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19931 // It's possible that the parent function no longer has a personality function
19932 // if the exceptional code was optimized away, in which case we just return
19933 // the incoming EBP.
19934 if (!Fn->hasPersonalityFn())
19937 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19938 // registration, or the .set_setframe offset.
19939 MCSymbol *OffsetSym =
19940 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19941 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19942 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
// Materialize the symbolic parent-frame offset via a LOCAL_RECOVER node.
19943 SDValue ParentFrameOffset =
19944 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19946 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19947 // prologue to RBP in the parent function.
19948 const X86Subtarget &Subtarget =
19949 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19950 if (Subtarget.is64Bit())
19951 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
// 32-bit path: walk back past the registration node, then apply the
// (negative) parent-frame offset, per the formula in the header comment.
19953 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19954 // RegNodeBase = EntryEBP - RegNodeSize
19955 // ParentFP = RegNodeBase - ParentFrameOffset
19956 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19957 DAG.getConstant(RegNodeSize, dl, PtrVT));
19958 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19961 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
19962 SelectionDAG &DAG) const {
19963 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19964 auto isRoundModeCurDirection = [](SDValue Rnd) {
19965 if (!isa<ConstantSDNode>(Rnd))
19968 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19969 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19973 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19974 MVT VT = Op.getSimpleValueType();
19975 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19977 switch(IntrData->Type) {
19978 case INTR_TYPE_1OP:
19979 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19980 case INTR_TYPE_2OP:
19981 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19983 case INTR_TYPE_3OP:
19984 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19985 Op.getOperand(2), Op.getOperand(3));
19986 case INTR_TYPE_4OP:
19987 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19988 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19989 case INTR_TYPE_1OP_MASK_RM: {
19990 SDValue Src = Op.getOperand(1);
19991 SDValue PassThru = Op.getOperand(2);
19992 SDValue Mask = Op.getOperand(3);
19993 SDValue RoundingMode;
19994 // We always add rounding mode to the Node.
19995 // If the rounding mode is not specified, we add the
19996 // "current direction" mode.
19997 if (Op.getNumOperands() == 4)
19999 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20001 RoundingMode = Op.getOperand(4);
20002 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
20003 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20005 Mask, PassThru, Subtarget, DAG);
20007 case INTR_TYPE_1OP_MASK: {
20008 SDValue Src = Op.getOperand(1);
20009 SDValue PassThru = Op.getOperand(2);
20010 SDValue Mask = Op.getOperand(3);
20011 // We add rounding mode to the Node when
20012 // - RM Opcode is specified and
20013 // - RM is not "current direction".
20014 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20015 if (IntrWithRoundingModeOpcode != 0) {
20016 SDValue Rnd = Op.getOperand(4);
20017 if (!isRoundModeCurDirection(Rnd)) {
20018 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20019 dl, Op.getValueType(),
20021 Mask, PassThru, Subtarget, DAG);
20024 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
20025 Mask, PassThru, Subtarget, DAG);
20027 case INTR_TYPE_SCALAR_MASK: {
20028 SDValue Src1 = Op.getOperand(1);
20029 SDValue Src2 = Op.getOperand(2);
20030 SDValue passThru = Op.getOperand(3);
20031 SDValue Mask = Op.getOperand(4);
20032 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20033 // There are 2 kinds of intrinsics in this group:
20034 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20035 // (2) With rounding mode and sae - 7 operands.
20036 bool HasRounding = IntrWithRoundingModeOpcode != 0;
20037 if (Op.getNumOperands() == (5U + HasRounding)) {
20039 SDValue Rnd = Op.getOperand(5);
20040 if (!isRoundModeCurDirection(Rnd))
20041 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20042 dl, VT, Src1, Src2, Rnd),
20043 Mask, passThru, Subtarget, DAG);
20045 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20047 Mask, passThru, Subtarget, DAG);
20050 assert(Op.getNumOperands() == (6U + HasRounding) &&
20051 "Unexpected intrinsic form");
20052 SDValue RoundingMode = Op.getOperand(5);
20054 SDValue Sae = Op.getOperand(6);
20055 if (!isRoundModeCurDirection(Sae))
20056 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20057 dl, VT, Src1, Src2,
20058 RoundingMode, Sae),
20059 Mask, passThru, Subtarget, DAG);
20061 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20062 Src2, RoundingMode),
20063 Mask, passThru, Subtarget, DAG);
20065 case INTR_TYPE_SCALAR_MASK_RM: {
20066 SDValue Src1 = Op.getOperand(1);
20067 SDValue Src2 = Op.getOperand(2);
20068 SDValue Src0 = Op.getOperand(3);
20069 SDValue Mask = Op.getOperand(4);
20070 // There are 2 kinds of intrinsics in this group:
20071 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20072 // (2) With rounding mode and sae - 7 operands.
20073 if (Op.getNumOperands() == 6) {
20074 SDValue Sae = Op.getOperand(5);
20075 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20077 Mask, Src0, Subtarget, DAG);
20079 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
20080 SDValue RoundingMode = Op.getOperand(5);
20081 SDValue Sae = Op.getOperand(6);
20082 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20083 RoundingMode, Sae),
20084 Mask, Src0, Subtarget, DAG);
20086 case INTR_TYPE_2OP_MASK:
20087 case INTR_TYPE_2OP_IMM8_MASK: {
20088 SDValue Src1 = Op.getOperand(1);
20089 SDValue Src2 = Op.getOperand(2);
20090 SDValue PassThru = Op.getOperand(3);
20091 SDValue Mask = Op.getOperand(4);
20093 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
20094 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
20096 // We specify 2 possible opcodes for intrinsics with rounding modes.
20097 // First, we check if the intrinsic may have non-default rounding mode,
20098 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20099 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20100 if (IntrWithRoundingModeOpcode != 0) {
20101 SDValue Rnd = Op.getOperand(5);
20102 if (!isRoundModeCurDirection(Rnd)) {
20103 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20104 dl, Op.getValueType(),
20106 Mask, PassThru, Subtarget, DAG);
20109 // TODO: Intrinsics should have fast-math-flags to propagate.
20110 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
20111 Mask, PassThru, Subtarget, DAG);
20113 case INTR_TYPE_2OP_MASK_RM: {
20114 SDValue Src1 = Op.getOperand(1);
20115 SDValue Src2 = Op.getOperand(2);
20116 SDValue PassThru = Op.getOperand(3);
20117 SDValue Mask = Op.getOperand(4);
20118 // We specify 2 possible modes for intrinsics, with/without rounding
20120 // First, we check if the intrinsic have rounding mode (6 operands),
20121 // if not, we set rounding mode to "current".
20123 if (Op.getNumOperands() == 6)
20124 Rnd = Op.getOperand(5);
20126 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20127 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20129 Mask, PassThru, Subtarget, DAG);
20131 case INTR_TYPE_3OP_SCALAR_MASK: {
20132 SDValue Src1 = Op.getOperand(1);
20133 SDValue Src2 = Op.getOperand(2);
20134 SDValue Src3 = Op.getOperand(3);
20135 SDValue PassThru = Op.getOperand(4);
20136 SDValue Mask = Op.getOperand(5);
20138 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20139 if (IntrWithRoundingModeOpcode != 0) {
20140 SDValue Rnd = Op.getOperand(6);
20141 if (!isRoundModeCurDirection(Rnd))
20142 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20143 dl, VT, Src1, Src2, Src3, Rnd),
20144 Mask, PassThru, Subtarget, DAG);
20146 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20148 Mask, PassThru, Subtarget, DAG);
20150 case INTR_TYPE_3OP_MASK_RM: {
20151 SDValue Src1 = Op.getOperand(1);
20152 SDValue Src2 = Op.getOperand(2);
20153 SDValue Imm = Op.getOperand(3);
20154 SDValue PassThru = Op.getOperand(4);
20155 SDValue Mask = Op.getOperand(5);
20156 // We specify 2 possible modes for intrinsics, with/without rounding
20158 // First, we check if the intrinsic have rounding mode (7 operands),
20159 // if not, we set rounding mode to "current".
20161 if (Op.getNumOperands() == 7)
20162 Rnd = Op.getOperand(6);
20164 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20165 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20166 Src1, Src2, Imm, Rnd),
20167 Mask, PassThru, Subtarget, DAG);
20169 case INTR_TYPE_3OP_IMM8_MASK:
20170 case INTR_TYPE_3OP_MASK: {
20171 SDValue Src1 = Op.getOperand(1);
20172 SDValue Src2 = Op.getOperand(2);
20173 SDValue Src3 = Op.getOperand(3);
20174 SDValue PassThru = Op.getOperand(4);
20175 SDValue Mask = Op.getOperand(5);
20177 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
20178 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
20180 // We specify 2 possible opcodes for intrinsics with rounding modes.
20181 // First, we check if the intrinsic may have non-default rounding mode,
20182 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20183 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20184 if (IntrWithRoundingModeOpcode != 0) {
20185 SDValue Rnd = Op.getOperand(6);
20186 if (!isRoundModeCurDirection(Rnd)) {
20187 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20188 dl, Op.getValueType(),
20189 Src1, Src2, Src3, Rnd),
20190 Mask, PassThru, Subtarget, DAG);
20193 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20195 Mask, PassThru, Subtarget, DAG);
20197 case VPERM_2OP_MASK : {
20198 SDValue Src1 = Op.getOperand(1);
20199 SDValue Src2 = Op.getOperand(2);
20200 SDValue PassThru = Op.getOperand(3);
20201 SDValue Mask = Op.getOperand(4);
20203 // Swap Src1 and Src2 in the node creation
20204 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
20205 Mask, PassThru, Subtarget, DAG);
20207 case VPERM_3OP_MASKZ:
20208 case VPERM_3OP_MASK:{
20209 MVT VT = Op.getSimpleValueType();
20210 // Src2 is the PassThru
20211 SDValue Src1 = Op.getOperand(1);
20212 // PassThru needs to be the same type as the destination in order
20213 // to pattern match correctly.
20214 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
20215 SDValue Src3 = Op.getOperand(3);
20216 SDValue Mask = Op.getOperand(4);
20217 SDValue PassThru = SDValue();
20219 // set PassThru element
20220 if (IntrData->Type == VPERM_3OP_MASKZ)
20221 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20225 // Swap Src1 and Src2 in the node creation
20226 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20227 dl, Op.getValueType(),
20229 Mask, PassThru, Subtarget, DAG);
20233 case FMA_OP_MASK: {
20234 SDValue Src1 = Op.getOperand(1);
20235 SDValue Src2 = Op.getOperand(2);
20236 SDValue Src3 = Op.getOperand(3);
20237 SDValue Mask = Op.getOperand(4);
20238 MVT VT = Op.getSimpleValueType();
20239 SDValue PassThru = SDValue();
20241 // set PassThru element
20242 if (IntrData->Type == FMA_OP_MASKZ)
20243 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20244 else if (IntrData->Type == FMA_OP_MASK3)
20249 // We specify 2 possible opcodes for intrinsics with rounding modes.
20250 // First, we check if the intrinsic may have non-default rounding mode,
20251 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20252 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20253 if (IntrWithRoundingModeOpcode != 0) {
20254 SDValue Rnd = Op.getOperand(5);
20255 if (!isRoundModeCurDirection(Rnd))
20256 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20257 dl, Op.getValueType(),
20258 Src1, Src2, Src3, Rnd),
20259 Mask, PassThru, Subtarget, DAG);
20261 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20262 dl, Op.getValueType(),
20264 Mask, PassThru, Subtarget, DAG);
20266 case FMA_OP_SCALAR_MASK:
20267 case FMA_OP_SCALAR_MASK3:
20268 case FMA_OP_SCALAR_MASKZ: {
20269 SDValue Src1 = Op.getOperand(1);
20270 SDValue Src2 = Op.getOperand(2);
20271 SDValue Src3 = Op.getOperand(3);
20272 SDValue Mask = Op.getOperand(4);
20273 MVT VT = Op.getSimpleValueType();
20274 SDValue PassThru = SDValue();
20276 // set PassThru element
20277 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
20278 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20279 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
20284 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20285 if (IntrWithRoundingModeOpcode != 0) {
20286 SDValue Rnd = Op.getOperand(5);
20287 if (!isRoundModeCurDirection(Rnd))
20288 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
20289 Op.getValueType(), Src1, Src2,
20291 Mask, PassThru, Subtarget, DAG);
20294 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
20295 Op.getValueType(), Src1, Src2,
20297 Mask, PassThru, Subtarget, DAG);
20299 case IFMA_OP_MASKZ:
20300 case IFMA_OP_MASK: {
20301 SDValue Src1 = Op.getOperand(1);
20302 SDValue Src2 = Op.getOperand(2);
20303 SDValue Src3 = Op.getOperand(3);
20304 SDValue Mask = Op.getOperand(4);
20305 MVT VT = Op.getSimpleValueType();
20306 SDValue PassThru = Src1;
20308 // set PassThru element
20309 if (IntrData->Type == IFMA_OP_MASKZ)
20310 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20312 // Node we need to swizzle the operands to pass the multiply operands
20314 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20315 dl, Op.getValueType(),
20317 Mask, PassThru, Subtarget, DAG);
20319 case TERLOG_OP_MASK:
20320 case TERLOG_OP_MASKZ: {
20321 SDValue Src1 = Op.getOperand(1);
20322 SDValue Src2 = Op.getOperand(2);
20323 SDValue Src3 = Op.getOperand(3);
20324 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
20325 SDValue Mask = Op.getOperand(5);
20326 MVT VT = Op.getSimpleValueType();
20327 SDValue PassThru = Src1;
20328 // Set PassThru element.
20329 if (IntrData->Type == TERLOG_OP_MASKZ)
20330 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20332 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20333 Src1, Src2, Src3, Src4),
20334 Mask, PassThru, Subtarget, DAG);
20337 // ISD::FP_ROUND has a second argument that indicates if the truncation
20338 // does not change the value. Set it to 0 since it can change.
20339 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20340 DAG.getIntPtrConstant(0, dl));
20341 case CVTPD2PS_MASK: {
20342 SDValue Src = Op.getOperand(1);
20343 SDValue PassThru = Op.getOperand(2);
20344 SDValue Mask = Op.getOperand(3);
20345 // We add rounding mode to the Node when
20346 // - RM Opcode is specified and
20347 // - RM is not "current direction".
20348 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20349 if (IntrWithRoundingModeOpcode != 0) {
20350 SDValue Rnd = Op.getOperand(4);
20351 if (!isRoundModeCurDirection(Rnd)) {
20352 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20353 dl, Op.getValueType(),
20355 Mask, PassThru, Subtarget, DAG);
20358 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20359 // ISD::FP_ROUND has a second argument that indicates if the truncation
20360 // does not change the value. Set it to 0 since it can change.
20361 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20362 DAG.getIntPtrConstant(0, dl)),
20363 Mask, PassThru, Subtarget, DAG);
20366 // FPclass intrinsics with mask
20367 SDValue Src1 = Op.getOperand(1);
20368 MVT VT = Src1.getSimpleValueType();
20369 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20370 SDValue Imm = Op.getOperand(2);
20371 SDValue Mask = Op.getOperand(3);
20372 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20373 Mask.getSimpleValueType().getSizeInBits());
20374 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
20375 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
20377 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20378 DAG.getUNDEF(BitcastVT), FPclassMask,
20379 DAG.getIntPtrConstant(0, dl));
20380 return DAG.getBitcast(Op.getValueType(), Res);
20383 SDValue Src1 = Op.getOperand(1);
20384 SDValue Imm = Op.getOperand(2);
20385 SDValue Mask = Op.getOperand(3);
20386 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
20387 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
20389 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
20390 DAG.getIntPtrConstant(0, dl));
20393 case CMP_MASK_CC: {
20394 // Comparison intrinsics with masks.
20395 // Example of transformation:
20396 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
20397 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
20399 // (v8i1 (insert_subvector undef,
20400 // (v2i1 (and (PCMPEQM %a, %b),
20401 // (extract_subvector
20402 // (v8i1 (bitcast %mask)), 0))), 0))))
20403 MVT VT = Op.getOperand(1).getSimpleValueType();
20404 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20405 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20406 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20407 Mask.getSimpleValueType().getSizeInBits());
20409 if (IntrData->Type == CMP_MASK_CC) {
20410 SDValue CC = Op.getOperand(3);
20411 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
20412 // We specify 2 possible opcodes for intrinsics with rounding modes.
20413 // First, we check if the intrinsic may have non-default rounding mode,
20414 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20415 if (IntrData->Opc1 != 0) {
20416 SDValue Rnd = Op.getOperand(5);
20417 if (!isRoundModeCurDirection(Rnd))
20418 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
20419 Op.getOperand(2), CC, Rnd);
20421 //default rounding mode
20423 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20424 Op.getOperand(2), CC);
20427 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
20428 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20431 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
20433 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20434 DAG.getUNDEF(BitcastVT), CmpMask,
20435 DAG.getIntPtrConstant(0, dl));
20436 return DAG.getBitcast(Op.getValueType(), Res);
20438 case CMP_MASK_SCALAR_CC: {
20439 SDValue Src1 = Op.getOperand(1);
20440 SDValue Src2 = Op.getOperand(2);
20441 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
20442 SDValue Mask = Op.getOperand(4);
20445 if (IntrData->Opc1 != 0) {
20446 SDValue Rnd = Op.getOperand(5);
20447 if (!isRoundModeCurDirection(Rnd))
20448 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
20450 //default rounding mode
20452 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
20454 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
20456 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
20457 DAG.getIntPtrConstant(0, dl));
20459 case COMI: { // Comparison intrinsics
20460 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20461 SDValue LHS = Op.getOperand(1);
20462 SDValue RHS = Op.getOperand(2);
20463 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
20464 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
20467 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
20468 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
20469 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
20470 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
20473 case ISD::SETNE: { // (ZF = 1 or PF = 1)
20474 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
20475 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
20476 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
20479 case ISD::SETGT: // (CF = 0 and ZF = 0)
20480 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
20482 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
20483 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
20486 case ISD::SETGE: // CF = 0
20487 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
20489 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
20490 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
20493 llvm_unreachable("Unexpected illegal condition!");
20495 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20497 case COMI_RM: { // Comparison intrinsics with Sae
20498 SDValue LHS = Op.getOperand(1);
20499 SDValue RHS = Op.getOperand(2);
20500 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
20501 SDValue Sae = Op.getOperand(4);
20504 if (isRoundModeCurDirection(Sae))
20505 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
20506 DAG.getConstant(CondVal, dl, MVT::i8));
20508 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20509 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20510 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
20511 DAG.getIntPtrConstant(0, dl));
20514 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
20515 Op.getOperand(1), Op.getOperand(2), Subtarget,
20517 case COMPRESS_EXPAND_IN_REG: {
20518 SDValue Mask = Op.getOperand(3);
20519 SDValue DataToCompress = Op.getOperand(1);
20520 SDValue PassThru = Op.getOperand(2);
20521 if (isAllOnesConstant(Mask)) // return data as is
20522 return Op.getOperand(1);
20524 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20526 Mask, PassThru, Subtarget, DAG);
20529 SDValue Mask = Op.getOperand(1);
20530 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20531 Mask.getSimpleValueType().getSizeInBits());
20532 Mask = DAG.getBitcast(MaskVT, Mask);
20533 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
20536 MVT VT = Op.getSimpleValueType();
20537 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
20539 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
20540 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
20541 // Arguments should be swapped.
20542 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
20543 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
20545 return DAG.getBitcast(VT, Res);
20548 MVT VT = Op.getSimpleValueType();
20549 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
20551 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
20552 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
20553 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
20554 return DAG.getBitcast(VT, Res);
20557 case FIXUPIMMS_MASKZ:
20559 case FIXUPIMM_MASKZ:{
20560 SDValue Src1 = Op.getOperand(1);
20561 SDValue Src2 = Op.getOperand(2);
20562 SDValue Src3 = Op.getOperand(3);
20563 SDValue Imm = Op.getOperand(4);
20564 SDValue Mask = Op.getOperand(5);
20565 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
20566 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
20567 // We specify 2 possible modes for intrinsics, with/without rounding
20569 // First, we check if the intrinsic have rounding mode (7 operands),
20570 // if not, we set rounding mode to "current".
20572 if (Op.getNumOperands() == 7)
20573 Rnd = Op.getOperand(6);
20575 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20576 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20577 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20578 Src1, Src2, Src3, Imm, Rnd),
20579 Mask, Passthru, Subtarget, DAG);
20580 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20581 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20582 Src1, Src2, Src3, Imm, Rnd),
20583 Mask, Passthru, Subtarget, DAG);
20585 case CONVERT_TO_MASK: {
20586 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
20587 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
20588 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
20590 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
20592 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20593 DAG.getUNDEF(BitcastVT), CvtMask,
20594 DAG.getIntPtrConstant(0, dl));
20595 return DAG.getBitcast(Op.getValueType(), Res);
20598 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20599 // Clear the upper bits of the rounding immediate so that the legacy
20600 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20601 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20603 DAG.getConstant(0xf, dl, MVT::i32));
20604 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20605 Op.getOperand(1), RoundingMode);
20608 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
20609 // Clear the upper bits of the rounding immediate so that the legacy
20610 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20611 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20613 DAG.getConstant(0xf, dl, MVT::i32));
20614 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20615 Op.getOperand(1), Op.getOperand(2), RoundingMode);
20623 default: return SDValue(); // Don't custom lower most intrinsics.
20625 case Intrinsic::x86_avx2_permd:
20626 case Intrinsic::x86_avx2_permps:
20627 // Operands intentionally swapped. Mask is last operand to intrinsic,
20628 // but second operand for node/instruction.
20629 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
20630 Op.getOperand(2), Op.getOperand(1));
20632 // ptest and testp intrinsics. The intrinsic these come from are designed to
20633 // return an integer value, not just an instruction so lower it to the ptest
20634 // or testp pattern and a setcc for the result.
20635 case Intrinsic::x86_sse41_ptestz:
20636 case Intrinsic::x86_sse41_ptestc:
20637 case Intrinsic::x86_sse41_ptestnzc:
20638 case Intrinsic::x86_avx_ptestz_256:
20639 case Intrinsic::x86_avx_ptestc_256:
20640 case Intrinsic::x86_avx_ptestnzc_256:
20641 case Intrinsic::x86_avx_vtestz_ps:
20642 case Intrinsic::x86_avx_vtestc_ps:
20643 case Intrinsic::x86_avx_vtestnzc_ps:
20644 case Intrinsic::x86_avx_vtestz_pd:
20645 case Intrinsic::x86_avx_vtestc_pd:
20646 case Intrinsic::x86_avx_vtestnzc_pd:
20647 case Intrinsic::x86_avx_vtestz_ps_256:
20648 case Intrinsic::x86_avx_vtestc_ps_256:
20649 case Intrinsic::x86_avx_vtestnzc_ps_256:
20650 case Intrinsic::x86_avx_vtestz_pd_256:
20651 case Intrinsic::x86_avx_vtestc_pd_256:
20652 case Intrinsic::x86_avx_vtestnzc_pd_256: {
20653 bool IsTestPacked = false;
20654 X86::CondCode X86CC;
20656 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
20657 case Intrinsic::x86_avx_vtestz_ps:
20658 case Intrinsic::x86_avx_vtestz_pd:
20659 case Intrinsic::x86_avx_vtestz_ps_256:
20660 case Intrinsic::x86_avx_vtestz_pd_256:
20661 IsTestPacked = true;
20663 case Intrinsic::x86_sse41_ptestz:
20664 case Intrinsic::x86_avx_ptestz_256:
20666 X86CC = X86::COND_E;
20668 case Intrinsic::x86_avx_vtestc_ps:
20669 case Intrinsic::x86_avx_vtestc_pd:
20670 case Intrinsic::x86_avx_vtestc_ps_256:
20671 case Intrinsic::x86_avx_vtestc_pd_256:
20672 IsTestPacked = true;
20674 case Intrinsic::x86_sse41_ptestc:
20675 case Intrinsic::x86_avx_ptestc_256:
20677 X86CC = X86::COND_B;
20679 case Intrinsic::x86_avx_vtestnzc_ps:
20680 case Intrinsic::x86_avx_vtestnzc_pd:
20681 case Intrinsic::x86_avx_vtestnzc_ps_256:
20682 case Intrinsic::x86_avx_vtestnzc_pd_256:
20683 IsTestPacked = true;
20685 case Intrinsic::x86_sse41_ptestnzc:
20686 case Intrinsic::x86_avx_ptestnzc_256:
20688 X86CC = X86::COND_A;
20692 SDValue LHS = Op.getOperand(1);
20693 SDValue RHS = Op.getOperand(2);
20694 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
20695 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
20696 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20697 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20699 case Intrinsic::x86_avx512_kortestz_w:
20700 case Intrinsic::x86_avx512_kortestc_w: {
20701 X86::CondCode X86CC =
20702 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
20703 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20704 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20705 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
20706 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20707 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20710 case Intrinsic::x86_avx512_knot_w: {
20711 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20712 SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
20713 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20714 return DAG.getBitcast(MVT::i16, Res);
20717 case Intrinsic::x86_avx512_kandn_w: {
20718 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20719 // Invert LHS for the not.
20720 LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
20721 DAG.getConstant(1, dl, MVT::v16i1));
20722 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20723 SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
20724 return DAG.getBitcast(MVT::i16, Res);
20727 case Intrinsic::x86_avx512_kxnor_w: {
20728 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20729 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20730 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20731 // Invert result for the not.
20732 Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
20733 DAG.getConstant(1, dl, MVT::v16i1));
20734 return DAG.getBitcast(MVT::i16, Res);
20737 case Intrinsic::x86_sse42_pcmpistria128:
20738 case Intrinsic::x86_sse42_pcmpestria128:
20739 case Intrinsic::x86_sse42_pcmpistric128:
20740 case Intrinsic::x86_sse42_pcmpestric128:
20741 case Intrinsic::x86_sse42_pcmpistrio128:
20742 case Intrinsic::x86_sse42_pcmpestrio128:
20743 case Intrinsic::x86_sse42_pcmpistris128:
20744 case Intrinsic::x86_sse42_pcmpestris128:
20745 case Intrinsic::x86_sse42_pcmpistriz128:
20746 case Intrinsic::x86_sse42_pcmpestriz128: {
20748 X86::CondCode X86CC;
20750 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
20751 case Intrinsic::x86_sse42_pcmpistria128:
20752 Opcode = X86ISD::PCMPISTRI;
20753 X86CC = X86::COND_A;
20755 case Intrinsic::x86_sse42_pcmpestria128:
20756 Opcode = X86ISD::PCMPESTRI;
20757 X86CC = X86::COND_A;
20759 case Intrinsic::x86_sse42_pcmpistric128:
20760 Opcode = X86ISD::PCMPISTRI;
20761 X86CC = X86::COND_B;
20763 case Intrinsic::x86_sse42_pcmpestric128:
20764 Opcode = X86ISD::PCMPESTRI;
20765 X86CC = X86::COND_B;
20767 case Intrinsic::x86_sse42_pcmpistrio128:
20768 Opcode = X86ISD::PCMPISTRI;
20769 X86CC = X86::COND_O;
20771 case Intrinsic::x86_sse42_pcmpestrio128:
20772 Opcode = X86ISD::PCMPESTRI;
20773 X86CC = X86::COND_O;
20775 case Intrinsic::x86_sse42_pcmpistris128:
20776 Opcode = X86ISD::PCMPISTRI;
20777 X86CC = X86::COND_S;
20779 case Intrinsic::x86_sse42_pcmpestris128:
20780 Opcode = X86ISD::PCMPESTRI;
20781 X86CC = X86::COND_S;
20783 case Intrinsic::x86_sse42_pcmpistriz128:
20784 Opcode = X86ISD::PCMPISTRI;
20785 X86CC = X86::COND_E;
20787 case Intrinsic::x86_sse42_pcmpestriz128:
20788 Opcode = X86ISD::PCMPESTRI;
20789 X86CC = X86::COND_E;
20792 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20793 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20794 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20795 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20796 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20799 case Intrinsic::x86_sse42_pcmpistri128:
20800 case Intrinsic::x86_sse42_pcmpestri128: {
20802 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
20803 Opcode = X86ISD::PCMPISTRI;
20805 Opcode = X86ISD::PCMPESTRI;
20807 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20808 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20809 return DAG.getNode(Opcode, dl, VTs, NewOps);
20812 case Intrinsic::eh_sjlj_lsda: {
20813 MachineFunction &MF = DAG.getMachineFunction();
20814 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20815 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20816 auto &Context = MF.getMMI().getContext();
20817 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20818 Twine(MF.getFunctionNumber()));
20819 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
20820 DAG.getMCSymbol(S, PtrVT));
20823 case Intrinsic::x86_seh_lsda: {
20824 // Compute the symbol for the LSDA. We know it'll get emitted later.
20825 MachineFunction &MF = DAG.getMachineFunction();
20826 SDValue Op1 = Op.getOperand(1);
20827 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20828 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20829 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20831 // Generate a simple absolute symbol reference. This intrinsic is only
20832 // supported on 32-bit Windows, which isn't PIC.
20833 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20834 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20837 case Intrinsic::x86_seh_recoverfp: {
20838 SDValue FnOp = Op.getOperand(1);
20839 SDValue IncomingFPOp = Op.getOperand(2);
20840 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20841 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20843 report_fatal_error(
20844 "llvm.x86.seh.recoverfp must take a function as the first argument");
20845 return recoverFramePointer(DAG, Fn, IncomingFPOp);
20848 case Intrinsic::localaddress: {
20849 // Returns one of the stack, base, or frame pointer registers, depending on
20850 // which is used to reference local variables.
20851 MachineFunction &MF = DAG.getMachineFunction();
20852 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20854 if (RegInfo->hasBasePointer(MF))
20855 Reg = RegInfo->getBaseRegister();
20856 else // This function handles the SP or FP case.
20857 Reg = RegInfo->getPtrSizedFrameRegister(MF);
20858 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
/// Lower a masked gather intrinsic to the target machine node \p Opc.
/// The x86 memory operand is assembled as {Base, Scale, Index, Disp, Segment}
/// with a zero displacement and no segment override; \p Mask is used directly
/// in its incoming value type. Returns the merged {gathered value, chain}
/// pair taken from results 0 and 2 of the machine node.
20863 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20864 SDValue Src, SDValue Mask, SDValue Base,
20865 SDValue Index, SDValue ScaleOp, SDValue Chain,
20866 const X86Subtarget &Subtarget) {
20868 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20869 // Scale must be constant.
20872 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20873 EVT MaskVT = Mask.getValueType();
// Three results: the gathered value, a MaskVT-typed output, and the chain.
20874 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20875 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20876 SDValue Segment = DAG.getRegister(0, MVT::i32);
20877 // If source is undef or we know it won't be used, use a zero vector
20878 // to break register dependency.
20879 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20880 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20881 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20882 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20883 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
// Skip result 1 (the MaskVT-typed output); callers get {value, chain}.
20884 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20885 return DAG.getMergeValues(RetOps, dl);
/// Lower a masked gather intrinsic whose mask must first be converted to a
/// vXi1 vector (one i1 lane per index element) via getMaskNode. Otherwise
/// mirrors getAVX2GatherNode: zero displacement, no segment override, and the
/// merged {gathered value, chain} pair is returned from machine-node results
/// 0 and 2.
20888 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20889 SDValue Src, SDValue Mask, SDValue Base,
20890 SDValue Index, SDValue ScaleOp, SDValue Chain,
20891 const X86Subtarget &Subtarget) {
20893 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20894 // Scale must be constant.
20897 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
// Build an i1 mask type with the same element count as the index vector.
20898 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20899 Index.getSimpleValueType().getVectorNumElements());
20901 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20902 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20903 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20904 SDValue Segment = DAG.getRegister(0, MVT::i32);
20905 // If source is undef or we know it won't be used, use a zero vector
20906 // to break register dependency.
20907 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20908 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20909 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20910 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20911 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
// Skip result 1 (the MaskVT-typed output); callers get {value, chain}.
20912 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20913 return DAG.getMergeValues(RetOps, dl);
/// Lower a masked scatter intrinsic to the target machine node \p Opc.
/// The mask is converted to a vXi1 vector sized by the index vector, the
/// memory operand uses a zero displacement and no segment override, and only
/// the output chain (machine-node result 1) is returned — a scatter produces
/// no value.
20916 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20917 SDValue Src, SDValue Mask, SDValue Base,
20918 SDValue Index, SDValue ScaleOp, SDValue Chain,
20919 const X86Subtarget &Subtarget) {
20921 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20922 // Scale must be constant.
20925 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20926 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20927 SDValue Segment = DAG.getRegister(0, MVT::i32);
// Build an i1 mask type with the same element count as the index vector.
20928 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20929 Index.getSimpleValueType().getVectorNumElements());
20931 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20932 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20933 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20934 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
// Result 0 is MaskVT-typed; only the chain matters to the caller.
20935 return SDValue(Res, 1);
/// Lower a masked gather/scatter *prefetch* intrinsic to machine node
/// \p Opc. Prefetches have no data operand and produce no value, so the node
/// is built with only MVT::Other and the chain is returned directly.
20938 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20939 SDValue Mask, SDValue Base, SDValue Index,
20940 SDValue ScaleOp, SDValue Chain,
20941 const X86Subtarget &Subtarget) {
20943 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20944 // Scale must be constant.
20947 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20948 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20949 SDValue Segment = DAG.getRegister(0, MVT::i32);
// i1 mask vector sized by the index vector's element count.
20951 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20952 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20953 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20954 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20955 return SDValue(Res, 0);
20958 /// Handles the lowering of builtin intrinsic that return the value
20959 /// of the extended control register.
/// Emits: CopyToReg(ECX, index) -> XGETBV -> CopyFromReg(EDX:EAX), then
/// merges the two halves into a single i64 pushed into \p Results, followed
/// by the updated chain.
20960 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20962 const X86Subtarget &Subtarget,
20963 SmallVectorImpl<SDValue> &Results) {
20964 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20965 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20968 // The ECX register is used to select the index of the XCR register to
20971 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20972 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20973 Chain = SDValue(N1, 0);
20975 // Reads the content of XCR and returns it in registers EDX:EAX.
// On 64-bit targets the copies use the full RAX/RDX to avoid partial-register
// reads; only the low 32 bits of each hold XCR data.
20976 if (Subtarget.is64Bit()) {
20977 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20978 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20981 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20982 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20985 Chain = HI.getValue(1);
20987 if (Subtarget.is64Bit()) {
20988 // Merge the two 32-bit values into a 64-bit one..
20989 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20990 DAG.getConstant(32, DL, MVT::i8));
20991 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20992 Results.push_back(Chain);
20996 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20997 SDValue Ops[] = { LO, HI };
20998 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20999 Results.push_back(Pair);
21000 Results.push_back(Chain);
21003 /// Handles the lowering of builtin intrinsics that read performance monitor
21004 /// counters (x86_rdpmc).
/// Emits: CopyToReg(ECX, counter-index) -> RDPMC -> CopyFromReg(EDX:EAX),
/// then merges the halves into a single i64 pushed into \p Results followed
/// by the updated chain.
21005 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
21007 const X86Subtarget &Subtarget,
21008 SmallVectorImpl<SDValue> &Results) {
21009 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21010 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21013 // The ECX register is used to select the index of the performance counter
21015 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
21017 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
21019 // Reads the content of a 64-bit performance counter and returns it in the
21020 // registers EDX:EAX.
// On 64-bit targets copy from the full RAX/RDX; only the low 32 bits of
// each hold counter data.
21021 if (Subtarget.is64Bit()) {
21022 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21023 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21026 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21027 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21030 Chain = HI.getValue(1);
21032 if (Subtarget.is64Bit()) {
21033 // The EAX register is loaded with the low-order 32 bits. The EDX register
21034 // is loaded with the supported high-order bits of the counter.
21035 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21036 DAG.getConstant(32, DL, MVT::i8));
21037 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21038 Results.push_back(Chain);
21042 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21043 SDValue Ops[] = { LO, HI };
21044 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21045 Results.push_back(Pair);
21046 Results.push_back(Chain);
21049 /// Handles the lowering of builtin intrinsics that read the time stamp counter
21050 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
21051 /// READCYCLECOUNTER nodes.
/// \p Opcode selects RDTSC vs RDTSCP; for RDTSCP the TSC_AUX value left in
/// ECX is additionally stored to the pointer passed as the intrinsic's
/// second argument. Pushes the merged 64-bit TSC and the final chain into
/// \p Results.
21052 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
21054 const X86Subtarget &Subtarget,
21055 SmallVectorImpl<SDValue> &Results) {
21056 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21057 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
21060 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
21061 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
21062 // and the EAX register is loaded with the low-order 32 bits.
21063 if (Subtarget.is64Bit()) {
21064 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21065 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21068 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21069 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21072 SDValue Chain = HI.getValue(1);
21074 if (Opcode == X86ISD::RDTSCP_DAG) {
21075 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21077 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
21078 // the ECX register. Add 'ecx' explicitly to the chain.
21079 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
21081 // Explicitly store the content of ECX at the location passed in input
21082 // to the 'rdtscp' intrinsic.
21083 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
21084 MachinePointerInfo());
21087 if (Subtarget.is64Bit()) {
21088 // The EDX register is loaded with the high-order 32 bits of the MSR, and
21089 // the EAX register is loaded with the low-order 32 bits.
21090 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21091 DAG.getConstant(32, DL, MVT::i8));
21092 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21093 Results.push_back(Chain);
21097 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21098 SDValue Ops[] = { LO, HI };
21099 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21100 Results.push_back(Pair);
21101 Results.push_back(Chain);
/// Custom-lower ISD::READCYCLECOUNTER by emitting RDTSC via
/// getReadTimeStampCounter and merging the {value, chain} results.
21104 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
21105 SelectionDAG &DAG) {
21106 SmallVector<SDValue, 2> Results;
21108 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
21110 return DAG.getMergeValues(Results, DL);
/// Lower llvm.x86.seh.ehregnode: record the frame index of the EH
/// registration node alloca in the function's WinEHFuncInfo. Emits no DAG
/// nodes; the chain operand is passed through unchanged.
21113 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
21114 MachineFunction &MF = DAG.getMachineFunction();
21115 SDValue Chain = Op.getOperand(0);
21116 SDValue RegNode = Op.getOperand(2);
21117 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21119 report_fatal_error("EH registrations only live in functions using WinEH");
21121 // Cast the operand to an alloca, and remember the frame index.
// The operand must already be a static alloca lowered to a frame index.
21122 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
21124 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
21125 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
21127 // Return the chain operand without making any DAG nodes.
/// Lower llvm.x86.seh.ehguard: record the frame index of the EH guard
/// alloca in the function's WinEHFuncInfo. Like MarkEHRegistrationNode, this
/// emits no DAG nodes and passes the chain through unchanged.
21131 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
21132 MachineFunction &MF = DAG.getMachineFunction();
21133 SDValue Chain = Op.getOperand(0);
21134 SDValue EHGuard = Op.getOperand(2);
21135 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21137 report_fatal_error("EHGuard only live in functions using WinEH");
21139 // Cast the operand to an alloca, and remember the frame index.
// The operand must already be a static alloca lowered to a frame index.
21140 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
21142 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
21143 EHInfo->EHGuardFrameIndex = FINode->getIndex();
21145 // Return the chain operand without making any DAG nodes.
21149 /// Emit Truncating Store with signed or unsigned saturation.
/// \p SignedSat selects the TruncSStore (signed) vs TruncUSStore (unsigned)
/// target memory node; \p MemVT is the narrowed type stored at \p Ptr.
21151 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
21152 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
21153 SelectionDAG &DAG) {
21155 SDVTList VTs = DAG.getVTList(MVT::Other);
// The operand slot after Ptr is unused for the unmasked form; fill it with
// an undef of pointer type.
21156 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
21157 SDValue Ops[] = { Chain, Val, Ptr, Undef };
21159 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21160 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21163 /// Emit Masked Truncating Store with signed or unsigned saturation.
/// Masked variant of EmitTruncSStore: \p Mask gates which lanes of \p Val
/// are written; \p SignedSat selects signed vs unsigned saturation.
21165 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
21166 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
21167 MachineMemOperand *MMO, SelectionDAG &DAG) {
21169 SDVTList VTs = DAG.getVTList(MVT::Other);
21170 SDValue Ops[] = { Chain, Ptr, Mask, Val };
21172 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21173 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
/// Lower ISD::INTRINSIC_W_CHAIN for x86. A few intrinsics are handled
/// directly by IntNo (SEH markers, EFLAGS read/write, LWPINS); everything
/// else is dispatched through the IntrinsicData table (getIntrinsicWithChain)
/// on IntrData->Type: RDRAND/RDSEED-style, gathers/scatters/prefetches,
/// timestamp/PMC/XCR reads, XTEST, ADX, and the masked compress/truncate/
/// expand memory intrinsics.
21176 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
21177 SelectionDAG &DAG) {
// Operand 0 is the chain; operand 1 is the intrinsic ID constant.
21178 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
21180 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
21183 case llvm::Intrinsic::x86_seh_ehregnode:
21184 return MarkEHRegistrationNode(Op, DAG);
21185 case llvm::Intrinsic::x86_seh_ehguard:
21186 return MarkEHGuard(Op, DAG);
21187 case llvm::Intrinsic::x86_flags_read_u32:
21188 case llvm::Intrinsic::x86_flags_read_u64:
21189 case llvm::Intrinsic::x86_flags_write_u32:
21190 case llvm::Intrinsic::x86_flags_write_u64: {
21191 // We need a frame pointer because this will get lowered to a PUSH/POP
21193 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21194 MFI.setHasCopyImplyingStackAdjustment(true);
21195 // Don't do anything here, we will expand these intrinsics out later
21196 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
21199 case Intrinsic::x86_lwpins32:
21200 case Intrinsic::x86_lwpins64: {
21202 SDValue Chain = Op->getOperand(0);
21203 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
21205 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
21206 Op->getOperand(3), Op->getOperand(4));
// LWPINS sets CF on buffer overflow; surface that as the i8 result.
21207 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
21208 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
21209 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
21210 LwpIns.getValue(1));
// Table-driven lowering for the remaining chained intrinsics.
21217 switch(IntrData->Type) {
21218 default: llvm_unreachable("Unknown Intrinsic Type");
21221 // Emit the node with the right value type.
21222 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
21223 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21225 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
21226 // Otherwise return the value from Rand, which is always 0, casted to i32.
21227 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
21228 DAG.getConstant(1, dl, Op->getValueType(1)),
21229 DAG.getConstant(X86::COND_B, dl, MVT::i8),
21230 SDValue(Result.getNode(), 1) };
21231 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
21233 // Return { result, isValid, chain }.
21234 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
21235 SDValue(Result.getNode(), 2));
21237 case GATHER_AVX2: {
// gather(v1, base, index, mask, scale) — operands in intrinsic order.
21238 SDValue Chain = Op.getOperand(0);
21239 SDValue Src = Op.getOperand(2);
21240 SDValue Base = Op.getOperand(3);
21241 SDValue Index = Op.getOperand(4);
21242 SDValue Mask = Op.getOperand(5);
21243 SDValue Scale = Op.getOperand(6);
21244 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21245 Scale, Chain, Subtarget);
21248 //gather(v1, mask, index, base, scale);
21249 SDValue Chain = Op.getOperand(0);
21250 SDValue Src = Op.getOperand(2);
21251 SDValue Base = Op.getOperand(3);
21252 SDValue Index = Op.getOperand(4);
21253 SDValue Mask = Op.getOperand(5);
21254 SDValue Scale = Op.getOperand(6);
21255 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
21259 //scatter(base, mask, index, v1, scale);
21260 SDValue Chain = Op.getOperand(0);
21261 SDValue Base = Op.getOperand(2);
21262 SDValue Mask = Op.getOperand(3);
21263 SDValue Index = Op.getOperand(4);
21264 SDValue Src = Op.getOperand(5);
21265 SDValue Scale = Op.getOperand(6);
21266 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21267 Scale, Chain, Subtarget);
// Gather/scatter prefetch: hint selects T0 (Opc1) vs T1 (Opc0) variants.
21270 SDValue Hint = Op.getOperand(6);
21271 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
21272 assert((HintVal == 2 || HintVal == 3) &&
21273 "Wrong prefetch hint in intrinsic: should be 2 or 3");
21274 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
21275 SDValue Chain = Op.getOperand(0);
21276 SDValue Mask = Op.getOperand(2);
21277 SDValue Index = Op.getOperand(3);
21278 SDValue Base = Op.getOperand(4);
21279 SDValue Scale = Op.getOperand(5);
21280 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
21283 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
21285 SmallVector<SDValue, 2> Results;
21286 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
21288 return DAG.getMergeValues(Results, dl);
21290 // Read Performance Monitoring Counters.
21292 SmallVector<SDValue, 2> Results;
21293 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
21294 return DAG.getMergeValues(Results, dl);
21296 // Get Extended Control Register.
21298 SmallVector<SDValue, 2> Results;
21299 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
21300 return DAG.getMergeValues(Results, dl);
21302 // XTEST intrinsics.
21304 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
21305 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
// XTEST sets ZF when not in a transaction; return the inverse as 0/1.
21307 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
21308 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
21309 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
21310 Ret, SDValue(InTrans.getNode(), 1));
// ADX (ADC/ADD with carry flag in/out).
21314 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
21315 SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
// ADD with -1 regenerates the incoming carry flag from the i8 operand.
21316 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
21317 DAG.getConstant(-1, dl, MVT::i8));
21318 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
21319 Op.getOperand(4), GenCF.getValue(1));
21320 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
21321 Op.getOperand(5), MachinePointerInfo());
21322 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
21323 SDValue Results[] = { SetCC, Store };
21324 return DAG.getMergeValues(Results, dl);
21326 case COMPRESS_TO_MEM: {
21327 SDValue Mask = Op.getOperand(4);
21328 SDValue DataToCompress = Op.getOperand(3);
21329 SDValue Addr = Op.getOperand(2);
21330 SDValue Chain = Op.getOperand(0);
21331 MVT VT = DataToCompress.getSimpleValueType();
21333 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21334 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21336 if (isAllOnesConstant(Mask)) // return just a store
21337 return DAG.getStore(Chain, dl, DataToCompress, Addr,
21338 MemIntr->getMemOperand());
21340 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21341 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21343 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
21344 MemIntr->getMemOperand(),
21345 false /* truncating */, true /* compressing */);
21347 case TRUNCATE_TO_MEM_VI8:
21348 case TRUNCATE_TO_MEM_VI16:
21349 case TRUNCATE_TO_MEM_VI32: {
21350 SDValue Mask = Op.getOperand(4);
21351 SDValue DataToTruncate = Op.getOperand(3);
21352 SDValue Addr = Op.getOperand(2);
21353 SDValue Chain = Op.getOperand(0);
21355 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21356 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21358 EVT MemVT = MemIntr->getMemoryVT();
// Opc0 distinguishes plain truncation from signed/unsigned saturation.
21360 uint16_t TruncationOp = IntrData->Opc0;
21361 switch (TruncationOp) {
21362 case X86ISD::VTRUNC: {
21363 if (isAllOnesConstant(Mask)) // return just a truncate store
21364 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
21365 MemIntr->getMemOperand());
21367 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21368 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21370 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
21371 MemIntr->getMemOperand(), true /* truncating */);
21373 case X86ISD::VTRUNCUS:
21374 case X86ISD::VTRUNCS: {
21375 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
21376 if (isAllOnesConstant(Mask))
21377 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
21378 MemIntr->getMemOperand(), DAG);
21380 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21381 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21383 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
21384 VMask, MemVT, MemIntr->getMemOperand(), DAG);
21387 llvm_unreachable("Unsupported truncstore intrinsic");
21391 case EXPAND_FROM_MEM: {
21392 SDValue Mask = Op.getOperand(4);
21393 SDValue PassThru = Op.getOperand(3);
21394 SDValue Addr = Op.getOperand(2);
21395 SDValue Chain = Op.getOperand(0);
21396 MVT VT = Op.getSimpleValueType();
21398 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21399 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21401 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
21402 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
// An all-zero mask loads nothing; the result lanes are unspecified.
21403 if (X86::isZeroNode(Mask))
21404 return DAG.getUNDEF(VT);
21406 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21407 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21408 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
21409 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
21410 true /* expanding */);
/// Lower ISD::RETURNADDR. Depth 0 loads the return address from its slot
/// relative to the frame; non-zero depths walk the frame chain via
/// LowerFRAMEADDR and load one slot past the frame pointer.
21415 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
21416 SelectionDAG &DAG) const {
21417 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21418 MFI.setReturnAddressIsTaken(true);
21420 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
21423 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21425 EVT PtrVT = getPointerTy(DAG.getDataLayout());
// For Depth > 0: the return address lives one slot above the saved frame
// pointer of the target frame.
21428 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
21429 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21430 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
21431 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
21432 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
21433 MachinePointerInfo());
21436 // Just load the return address.
21437 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
21438 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
21439 MachinePointerInfo());
/// Lower ISD::ADDROFRETURNADDR: return the address of the return-address
/// slot itself (not its contents), marking the return address as taken.
21442 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
21443 SelectionDAG &DAG) const {
21444 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
21445 return getReturnAddressFrameIndex(DAG);
/// Lower ISD::FRAMEADDR. On Windows-CFI targets a fixed frame object is
/// returned instead of walking frames (unwind codes forbid stack crawling);
/// otherwise the frame register is read and, for non-zero depths, the saved
/// frame pointers are chased with loads.
21448 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
21449 MachineFunction &MF = DAG.getMachineFunction();
21450 MachineFrameInfo &MFI = MF.getFrameInfo();
21451 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
21452 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21453 EVT VT = Op.getValueType();
21455 MFI.setFrameAddressIsTaken(true);
21457 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
21458 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
21459 // is not possible to crawl up the stack without looking at the unwind codes
// Lazily create (and cache in FuncInfo) a fixed object at offset 0 whose
// address stands in for the frame address.
21461 int FrameAddrIndex = FuncInfo->getFAIndex();
21462 if (!FrameAddrIndex) {
21463 // Set up a frame object for the return address.
21464 unsigned SlotSize = RegInfo->getSlotSize();
21465 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
21466 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
21467 FuncInfo->setFAIndex(FrameAddrIndex);
21469 return DAG.getFrameIndex(FrameAddrIndex, VT);
21472 unsigned FrameReg =
21473 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21474 SDLoc dl(Op); // FIXME probably not meaningful
21475 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21476 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
21477 (FrameReg == X86::EBP && VT == MVT::i32)) &&
21478 "Invalid Frame Register!");
21479 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
// Each load dereferences a saved frame pointer, walking one frame up.
21481 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
21482 MachinePointerInfo());
21486 // FIXME? Maybe this could be a TableGen attribute on some registers and
21487 // this table could be generated automatically from RegInfo.
/// Resolve a named-register global (e.g. llvm.read_register) to a physical
/// register. Only esp/rsp/ebp/rbp are recognized; requesting the frame
/// pointer in a function without one is a fatal error, and an unknown name
/// is reported as an invalid register-name global.
21488 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
21489 SelectionDAG &DAG) const {
21490 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21491 const MachineFunction &MF = DAG.getMachineFunction();
21493 unsigned Reg = StringSwitch<unsigned>(RegName)
21494 .Case("esp", X86::ESP)
21495 .Case("rsp", X86::RSP)
21496 .Case("ebp", X86::EBP)
21497 .Case("rbp", X86::RBP)
21500 if (Reg == X86::EBP || Reg == X86::RBP) {
21501 if (!TFI.hasFP(MF))
21502 report_fatal_error("register " + StringRef(RegName) +
21503 " is allocatable: function has no frame pointer");
21506 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21507 unsigned FrameReg =
21508 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21509 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
21510 "Invalid Frame Register!");
21518 report_fatal_error("Invalid register name global variable");
/// Lower ISD::FRAME_TO_ARGS_OFFSET: the distance from the frame pointer to
/// the incoming arguments is two stack slots (presumably the return address
/// and the saved frame pointer — confirm against the frame layout).
21521 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
21522 SelectionDAG &DAG) const {
21523 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21524 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
/// Register holding the exception pointer on unwind: RDX/EDX for CoreCLR
/// personalities, RAX/EAX otherwise; 64-bit register only on LP64 targets.
21527 unsigned X86TargetLowering::getExceptionPointerRegister(
21528 const Constant *PersonalityFn) const {
21529 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
21530 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21532 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
/// Register holding the exception selector on unwind: RDX/EDX. Invalid for
/// funclet personalities, which do not use selectors (asserted below).
21535 unsigned X86TargetLowering::getExceptionSelectorRegister(
21536 const Constant *PersonalityFn) const {
21537 // Funclet personalities don't use selectors (the runtime does the selection).
21538 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
21539 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
// Win64 requires catch objects to live at fixed frame offsets.
21542 bool X86TargetLowering::needsFixedCatchObjects() const {
21543 return Subtarget.isTargetWin64();
// Lower EH_RETURN: store the handler address just above the frame pointer
// plus the requested offset, publish that address in RCX/ECX, and emit the
// X86ISD::EH_RETURN node that consumes it.
// NOTE(review): numbering gaps (21550-21551, 21563, 21567) hide the SDLoc
// declaration and part of the getIntPtrConstant call — confirm in the full
// file.
21546 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
21547 SDValue Chain = Op.getOperand(0);
21548 SDValue Offset = Op.getOperand(1);
21549 SDValue Handler = Op.getOperand(2);
21552 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21553 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21554 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
// Frame register width must match the pointer type.
21555 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
21556 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
21557 "Invalid Frame Register!");
21558 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
21559 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
// Handler slot address = frame pointer + slot size + caller-supplied offset.
21561 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
21562 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
21564 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
21565 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
21566 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
21568 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
21569 DAG.getRegister(StoreAddrReg, PtrVT));
// Lower the SjLj setjmp intrinsic to X86ISD::EH_SJLJ_SETJMP, producing an
// i32 result plus a chain. On 32-bit targets the global base register is
// requested eagerly (see comment below).
21572 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
21573 SelectionDAG &DAG) const {
21575 // If the subtarget is not 64bit, we may need the global base reg
21576 // after isel expand pseudo, i.e., after CGBR pass ran.
21577 // Therefore, ask for the GlobalBaseReg now, so that the pass
21578 // inserts the code for us in case we need it.
21579 // Otherwise, we will end up in a situation where we will
21580 // reference a virtual register that is not defined!
21581 if (!Subtarget.is64Bit()) {
21582 const X86InstrInfo *TII = Subtarget.getInstrInfo();
21583 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
21585 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
21586 DAG.getVTList(MVT::i32, MVT::Other),
21587 Op.getOperand(0), Op.getOperand(1));
// Lower the SjLj longjmp intrinsic to a chain-only X86ISD::EH_SJLJ_LONGJMP
// node (operands: chain, jump-buffer address).
21590 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
21591 SelectionDAG &DAG) const {
21593 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
21594 Op.getOperand(0), Op.getOperand(1));
// Lower the SjLj setup-dispatch intrinsic to a chain-only
// X86ISD::EH_SJLJ_SETUP_DISPATCH node.
// NOTE(review): the listing gap at 21601-21603 hides the node's operand
// list and the closing brace — confirm in the full file.
21597 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
21598 SelectionDAG &DAG) const {
21600 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
// ADJUST_TRAMPOLINE is a no-op on X86: the adjusted address is the
// trampoline address itself (operand 0).
21604 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
21605 return Op.getOperand(0);
// Materialize a nested-function trampoline in memory.
// 64-bit: writes "movabsq $fptr, %r11; movabsq $nest, %r10; jmpq *%r11"
// byte-by-byte into the trampoline buffer. 32-bit: writes
// "movl $nest, %reg; jmp fptr" where reg depends on the calling convention
// (ECX for C/StdCall, EAX for FastCall/ThisCall/Fast).
// NOTE(review): this listing has numbering gaps throughout (e.g. the
// OutChains[1]/OutChains[3] assignment prefixes at 21639/21653, the switch
// header near 21676-21678, and several closing braces are not visible) —
// treat the visible lines as a partial transcript.
21608 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
21609 SelectionDAG &DAG) const {
21610 SDValue Root = Op.getOperand(0);
21611 SDValue Trmp = Op.getOperand(1); // trampoline
21612 SDValue FPtr = Op.getOperand(2); // nested function
21613 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
21616 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21617 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
21619 if (Subtarget.is64Bit()) {
21620 SDValue OutChains[6];
21622 // Large code-model.
21623 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
21624 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
// Low 3 encoding bits of R10/R11; the high bit goes into the REX prefix.
21626 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
21627 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21629 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
21631 // Load the pointer to the nested function into R11.
21632 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
21633 SDValue Addr = Trmp;
21634 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21635 Addr, MachinePointerInfo(TrmpAddr));
21637 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21638 DAG.getConstant(2, dl, MVT::i64));
21640 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21641 /* Alignment = */ 2);
21643 // Load the 'nest' parameter value into R10.
21644 // R10 is specified in X86CallingConv.td
21645 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21646 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21647 DAG.getConstant(10, dl, MVT::i64));
21648 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21649 Addr, MachinePointerInfo(TrmpAddr, 10));
21651 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21652 DAG.getConstant(12, dl, MVT::i64));
21654 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21655 /* Alignment = */ 2);
21657 // Jump to the nested function.
21658 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21659 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21660 DAG.getConstant(20, dl, MVT::i64));
21661 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21662 Addr, MachinePointerInfo(TrmpAddr, 20));
// ModRM byte selecting register-direct r11 as the jmp target.
21664 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21665 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21666 DAG.getConstant(22, dl, MVT::i64));
21667 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21668 Addr, MachinePointerInfo(TrmpAddr, 22));
21670 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
// 32-bit path: pick the 'nest' register from the calling convention.
21672 const Function *Func =
21673 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21674 CallingConv::ID CC = Func->getCallingConv();
21679 llvm_unreachable("Unsupported calling convention");
21680 case CallingConv::C:
21681 case CallingConv::X86_StdCall: {
21682 // Pass 'nest' parameter in ECX.
21683 // Must be kept in sync with X86CallingConv.td
21684 NestReg = X86::ECX;
21686 // Check that ECX wasn't needed by an 'inreg' parameter.
21687 FunctionType *FTy = Func->getFunctionType();
21688 const AttributeList &Attrs = Func->getAttributes();
21690 if (!Attrs.isEmpty() && !Func->isVarArg()) {
21691 unsigned InRegCount = 0;
// Count 32-bit register slots consumed by 'inreg' parameters; more than
// two would clobber ECX, which 'nest' needs.
21694 for (FunctionType::param_iterator I = FTy->param_begin(),
21695 E = FTy->param_end(); I != E; ++I, ++Idx)
21696 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21697 auto &DL = DAG.getDataLayout();
21698 // FIXME: should only count parameters that are lowered to integers.
21699 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21702 if (InRegCount > 2) {
21703 report_fatal_error("Nest register in use - reduce number of inreg"
21709 case CallingConv::X86_FastCall:
21710 case CallingConv::X86_ThisCall:
21711 case CallingConv::Fast:
21712 // Pass 'nest' parameter in EAX.
21713 // Must be kept in sync with X86CallingConv.td
21714 NestReg = X86::EAX;
21718 SDValue OutChains[4];
21719 SDValue Addr, Disp;
// Disp = relative displacement from the byte after the jmp to FPtr.
21721 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21722 DAG.getConstant(10, dl, MVT::i32));
21723 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21725 // This is storing the opcode for MOV32ri.
21726 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21727 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21729 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21730 Trmp, MachinePointerInfo(TrmpAddr));
21732 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21733 DAG.getConstant(1, dl, MVT::i32));
21735 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21736 /* Alignment = */ 1);
21738 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21739 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21740 DAG.getConstant(5, dl, MVT::i32));
21741 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21742 Addr, MachinePointerInfo(TrmpAddr, 5),
21743 /* Alignment = */ 1);
21745 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21746 DAG.getConstant(6, dl, MVT::i32));
21748 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21749 /* Alignment = */ 1);
21751 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
// Lower FLT_ROUNDS_: read the x87 control word with FNSTCW into a stack
// slot, reload it, and remap the hardware rounding-mode bits (11:10) to the
// FLT_ROUNDS encoding via the bit formula documented below.
// NOTE(review): listing gaps hide parts of the block comment, the SDLoc/CWD
// declarations (21797/21801), and intermediate lines — partial transcript.
21755 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21756 SelectionDAG &DAG) const {
21758 The rounding mode is in bits 11:10 of FPSR, and has the following
21760 00 Round to nearest
21765 FLT_ROUNDS, on the other hand, expects the following:
21772 To perform the conversion, we do:
21773 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
21776 MachineFunction &MF = DAG.getMachineFunction();
21777 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21778 unsigned StackAlignment = TFI.getStackAlignment();
21779 MVT VT = Op.getSimpleValueType();
21782 // Save FP Control Word to stack slot
21783 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21784 SDValue StackSlot =
21785 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21787 MachineMemOperand *MMO =
21788 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21789 MachineMemOperand::MOStore, 2, 2);
// FNSTCW16m stores the 16-bit control word through the stack-slot pointer.
21791 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21792 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21793 DAG.getVTList(MVT::Other),
21794 Ops, MVT::i16, MMO);
21796 // Load FP Control Word from stack slot
21798 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21800 // Transform as necessary
// Isolate bit 11 -> bit 0.
21802 DAG.getNode(ISD::SRL, DL, MVT::i16,
21803 DAG.getNode(ISD::AND, DL, MVT::i16,
21804 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21805 DAG.getConstant(11, DL, MVT::i8));
// Isolate bit 10 -> bit 1.
21807 DAG.getNode(ISD::SRL, DL, MVT::i16,
21808 DAG.getNode(ISD::AND, DL, MVT::i16,
21809 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21810 DAG.getConstant(9, DL, MVT::i8));
// ((bits | ) + 1) & 3 completes the remapping shown in the header comment.
21813 DAG.getNode(ISD::AND, DL, MVT::i16,
21814 DAG.getNode(ISD::ADD, DL, MVT::i16,
21815 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
21816 DAG.getConstant(1, DL, MVT::i16)),
21817 DAG.getConstant(3, DL, MVT::i16));
// Adjust i16 intermediate to the requested result type.
21819 return DAG.getNode((VT.getSizeInBits() < 16 ?
21820 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
21823 // Split an unary integer op into 2 half sized ops.
// Extracts the lo/hi halves of the single vector operand, applies the same
// opcode to each half-width vector, and concatenates the results.
21824 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21825 MVT VT = Op.getSimpleValueType();
21826 unsigned NumElems = VT.getVectorNumElements();
21827 unsigned SizeInBits = VT.getSizeInBits();
21829 // Extract the Lo/Hi vectors
21831 SDValue Src = Op.getOperand(0);
21832 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21833 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21835 MVT EltVT = VT.getVectorElementType();
21836 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21837 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21838 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
21839 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
21842 // Decompose 256-bit ops into smaller 128-bit ops.
// Thin wrapper: asserts the 256-bit integer precondition and delegates to
// the generic half-splitting helper.
21843 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21844 assert(Op.getSimpleValueType().is256BitVector() &&
21845 Op.getSimpleValueType().isInteger() &&
21846 "Only handle AVX 256-bit vector integer operation");
21847 return LowerVectorIntUnary(Op, DAG);
21850 // Decompose 512-bit ops into smaller 256-bit ops.
// Thin wrapper: asserts the 512-bit integer precondition and delegates to
// the generic half-splitting helper.
21851 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21852 assert(Op.getSimpleValueType().is512BitVector() &&
21853 Op.getSimpleValueType().isInteger() &&
21854 "Only handle AVX 512-bit vector integer operation");
21855 return LowerVectorIntUnary(Op, DAG);
21858 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
21860 // i8/i16 vector implemented using dword LZCNT vector instruction
21861 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
21862 // split the vector, perform operation on it's Lo a Hi part and
21863 // concatenate the results.
// NOTE(review): listing gaps (21866, 21873, 21875, 21877) hide the SDLoc
// declaration and the condition guarding the split-vector early return.
21864 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
21865 assert(Op.getOpcode() == ISD::CTLZ);
21867 MVT VT = Op.getSimpleValueType();
21868 MVT EltVT = VT.getVectorElementType();
21869 unsigned NumElems = VT.getVectorNumElements();
21871 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21872 "Unsupported element type");
21874 // Split vector, it's Lo and Hi parts will be handled in next iteration.
21876 return LowerVectorIntUnary(Op, DAG);
21878 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21879 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21880 "Unsupported value type for operation");
21882 // Use native supported vector instruction vplzcntd.
21883 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21884 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21885 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
// Correct for the zero-extension: ctlz over i32 overcounts by 32-EltBits.
21886 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21888 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21891 // Lower CTLZ using a PSHUFB lookup table implementation.
// Computes per-nibble leading-zero counts through a 16-entry PSHUFB LUT,
// combines lo/hi nibbles into per-byte counts, then repeatedly doubles the
// element width (combining half-counts) until the original type is reached.
// NOTE(review): listing gaps hide the HiZ declaration (~21918), a closing
// brace or two, and the final return (~21970) — partial transcript.
21892 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21893 const X86Subtarget &Subtarget,
21894 SelectionDAG &DAG) {
21895 MVT VT = Op.getSimpleValueType();
21896 int NumElts = VT.getVectorNumElements();
21897 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21898 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21900 // Per-nibble leading zero PSHUFB lookup table.
21901 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21902 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21903 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21904 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
21906 SmallVector<SDValue, 64> LUTVec;
21907 for (int i = 0; i < NumBytes; ++i)
21908 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21909 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21911 // Begin by bitcasting the input to byte vector, then split those bytes
21912 // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
21913 // If the hi input nibble is zero then we add both results together, otherwise
21914 // we just take the hi result (by masking the lo result to zero before the
21916 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21917 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21919 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21920 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21921 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21922 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
// 512-bit compares produce a vXi1 mask; sign-extend it back to a byte mask
// so it can be used with AND below.
21924 if (CurrVT.is512BitVector()) {
21925 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21926 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
21927 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
21929 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21932 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21933 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
// Lo count only contributes when the hi nibble is zero.
21934 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21935 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21937 // Merge result back from vXi8 back to VT, working on the lo/hi halves
21938 // of the current vector width in the same way we did for the nibbles.
21939 // If the upper half of the input element is zero then add the halves'
21940 // leading zero counts together, otherwise just use the upper half's.
21941 // Double the width of the result until we are at target width.
21942 while (CurrVT != VT) {
21943 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21944 int CurrNumElts = CurrVT.getVectorNumElements();
21945 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21946 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21947 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21949 // Check if the upper half of the input element is zero.
21950 if (CurrVT.is512BitVector()) {
21951 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21952 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
21953 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21954 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
21956 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21957 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21959 HiZ = DAG.getBitcast(NextVT, HiZ);
21961 // Move the upper/lower halves to the lower bits as we'll be extending to
21962 // NextVT. Mask the lower result to zero if HiZ is true and add the results
21964 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21965 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21966 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21967 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21968 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
// Dispatch vector CTLZ lowering: native VPLZCNTD when AVX512CDI is
// available, otherwise split oversized vectors and fall back to the PSHUFB
// in-register lookup-table implementation.
21975 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21976 const X86Subtarget &Subtarget,
21977 SelectionDAG &DAG) {
21978 MVT VT = Op.getSimpleValueType();
21980 if (Subtarget.hasCDI())
21981 return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21983 // Decompose 256-bit ops into smaller 128-bit ops.
21984 if (VT.is256BitVector() && !Subtarget.hasInt256())
21985 return Lower256IntUnary(Op, DAG);
21987 // Decompose 512-bit ops into smaller 256-bit ops.
21988 if (VT.is512BitVector() && !Subtarget.hasBWI())
21989 return Lower512IntUnary(Op, DAG);
21991 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21992 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
// Lower scalar/vector CTLZ and CTLZ_ZERO_UNDEF. Vectors dispatch to
// LowerVectorCTLZ. Scalars use BSR (bit index of highest set bit), patch the
// zero-input case via CMOV for plain CTLZ, and XOR with NumBits-1 to turn
// the bit index into a leading-zero count.
// NOTE(review): listing gaps hide the SDLoc/OpVT declarations and some CMOV
// operand lines (e.g. 22019-22020, 22031-22032) — partial transcript.
21995 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21996 SelectionDAG &DAG) {
21997 MVT VT = Op.getSimpleValueType();
21999 unsigned NumBits = VT.getSizeInBits();
22001 unsigned Opc = Op.getOpcode();
22004 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
22006 Op = Op.getOperand(0);
22007 if (VT == MVT::i8) {
22008 // Zero extend to i32 since there is not an i8 bsr.
22010 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
22013 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
22014 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
22015 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
22017 if (Opc == ISD::CTLZ) {
22018 // If src is zero (i.e. bsr sets ZF), returns NumBits.
22021 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
22022 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22025 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
22028 // Finally xor with NumBits-1.
22029 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
22030 DAG.getConstant(NumBits - 1, dl, OpVT));
// For i8 inputs, truncate the widened result back down.
22033 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
// Lower CTTZ / CTTZ_ZERO_UNDEF. Vectors: isolate the lowest set bit with
// (x & -x), then either (width-1) - ctlz(lsb) for the undef-on-zero form or
// ctpop(lsb - 1) for the defined form. Scalars: BSF plus a CMOV to return
// NumBits when the input is zero.
// NOTE(review): listing gaps hide SDLoc declarations and CMOV operand lines
// (e.g. 22071-22072, 22075-22076) — partial transcript.
22037 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
22038 MVT VT = Op.getSimpleValueType();
22039 unsigned NumBits = VT.getScalarSizeInBits();
22042 if (VT.isVector()) {
22043 SDValue N0 = Op.getOperand(0);
22044 SDValue Zero = DAG.getConstant(0, dl, VT);
22046 // lsb(x) = (x & -x)
22047 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
22048 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
22050 // cttz_undef(x) = (width - 1) - ctlz(lsb)
22051 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
22052 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
22053 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
22054 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
22057 // cttz(x) = ctpop(lsb - 1)
22058 SDValue One = DAG.getConstant(1, dl, VT);
22059 return DAG.getNode(ISD::CTPOP, dl, VT,
22060 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
22063 assert(Op.getOpcode() == ISD::CTTZ &&
22064 "Only scalar CTTZ requires custom lowering");
22066 // Issue a bsf (scan bits forward) which also sets EFLAGS.
22067 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22068 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
22070 // If src is zero (i.e. bsf sets ZF), returns NumBits.
22073 DAG.getConstant(NumBits, dl, VT),
22074 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22077 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
22080 /// Break a 256-bit integer operation into two new 128-bit ones and then
22081 /// concatenate the result back.
// Splits both operands into 128-bit halves, applies the same opcode to each
// half pair, and rebuilds the 256-bit result with CONCAT_VECTORS.
22082 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
22083 MVT VT = Op.getSimpleValueType();
22085 assert(VT.is256BitVector() && VT.isInteger() &&
22086 "Unsupported value type for operation");
22088 unsigned NumElems = VT.getVectorNumElements();
22091 // Extract the LHS vectors
22092 SDValue LHS = Op.getOperand(0);
22093 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
22094 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
22096 // Extract the RHS vectors
22097 SDValue RHS = Op.getOperand(1);
22098 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
22099 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
22101 MVT EltVT = VT.getVectorElementType();
22102 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22104 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22105 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22106 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22109 /// Break a 512-bit integer operation into two new 256-bit ones and then
22110 /// concatenate the result back.
// Mirror of Lower256IntArith at the 512->256 bit level.
22111 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
22112 MVT VT = Op.getSimpleValueType();
22114 assert(VT.is512BitVector() && VT.isInteger() &&
22115 "Unsupported value type for operation");
22117 unsigned NumElems = VT.getVectorNumElements();
22120 // Extract the LHS vectors
22121 SDValue LHS = Op.getOperand(0);
22122 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
22123 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
22125 // Extract the RHS vectors
22126 SDValue RHS = Op.getOperand(1);
22127 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
22128 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
22130 MVT EltVT = VT.getVectorElementType();
22131 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22133 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22134 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22135 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
// Lower vector ADD/SUB: i1 elements use XOR (add and sub are both xor in
// GF(2)); otherwise split the 256-bit op into two 128-bit halves.
22138 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
22139 MVT VT = Op.getSimpleValueType();
22140 if (VT.getScalarType() == MVT::i1)
22141 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
22142 Op.getOperand(0), Op.getOperand(1));
22143 assert(Op.getSimpleValueType().is256BitVector() &&
22144 Op.getSimpleValueType().isInteger() &&
22145 "Only handle AVX 256-bit vector integer operation");
22146 return Lower256IntArith(Op, DAG);
// Lower ABS: scalar i16/i32/i64 become NEG + CMOV on the sign condition
// (COND_GE selects the original value when it is non-negative); 256-bit
// vectors are split into 128-bit halves.
// NOTE(review): listing gaps (22154, 22161-22162) hide the SDLoc declaration
// and the scalar path's closing brace.
22149 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
22150 MVT VT = Op.getSimpleValueType();
22151 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
22152 // Since X86 does not have CMOV for 8-bit integer, we don't convert
22153 // 8-bit integer abs to NEG and CMOV.
22155 SDValue N0 = Op.getOperand(0);
// X86ISD::SUB also produces EFLAGS (second result), consumed by the CMOV.
22156 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
22157 DAG.getConstant(0, DL, VT), N0);
22158 SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
22159 SDValue(Neg.getNode(), 1)};
22160 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
22163 assert(Op.getSimpleValueType().is256BitVector() &&
22164 Op.getSimpleValueType().isInteger() &&
22165 "Only handle AVX 256-bit vector integer operation");
22166 return Lower256IntUnary(Op, DAG);
// Lower vector min/max by splitting the 256-bit op into 128-bit halves.
22169 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
22170 assert(Op.getSimpleValueType().is256BitVector() &&
22171 Op.getSimpleValueType().isInteger() &&
22172 "Only handle AVX 256-bit vector integer operation");
22173 return Lower256IntArith(Op, DAG);
// Custom-lower vector multiply:
//  - i1 elements: multiply is AND.
//  - vXi8: sign-extend to i16 pairs, multiply, mask+pack (with SSE41 and
//    pre-SSE41 variants for the extension shuffles).
//  - v4i32 without SSE41: PMADDWD when the upper 17 bits are zero, else
//    two PMULUDQ on even/odd lanes merged by shuffle.
//  - v2i64/v4i64/v8i64: PMULDQ when sign bits allow, else the classic
//    three-PMULUDQ decomposition (skipping halves known to be zero).
// NOTE(review): listing gaps hide several lines, including the ALo/BLo and
// AHi/BHi declarations, the MULLQ return near 22327, and some closing
// braces — partial transcript.
22176 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
22177 SelectionDAG &DAG) {
22179 MVT VT = Op.getSimpleValueType();
// For i1 vectors, multiplication is logical AND.
22181 if (VT.getScalarType() == MVT::i1)
22182 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
22184 // Decompose 256-bit ops into smaller 128-bit ops.
22185 if (VT.is256BitVector() && !Subtarget.hasInt256())
22186 return Lower256IntArith(Op, DAG);
22188 SDValue A = Op.getOperand(0);
22189 SDValue B = Op.getOperand(1);
22191 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
22192 // vector pairs, multiply and truncate.
22193 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
22194 if (Subtarget.hasInt256()) {
22195 // For 512-bit vectors, split into 256-bit vectors to allow the
22196 // sign-extension to occur.
22197 if (VT == MVT::v64i8)
22198 return Lower512IntArith(Op, DAG);
22200 // For 256-bit vectors, split into 128-bit vectors to allow the
22201 // sign-extension to occur. We don't need this on AVX512BW as we can
22202 // safely sign-extend to v32i16.
22203 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
22204 return Lower256IntArith(Op, DAG);
22206 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
22207 return DAG.getNode(
22208 ISD::TRUNCATE, dl, VT,
22209 DAG.getNode(ISD::MUL, dl, ExVT,
22210 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
22211 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
22214 assert(VT == MVT::v16i8 &&
22215 "Pre-AVX2 support only supports v16i8 multiplication");
22216 MVT ExVT = MVT::v8i16;
22218 // Extract the lo parts and sign extend to i16
22220 if (Subtarget.hasSSE41()) {
22221 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
22222 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
// Pre-SSE41: unpack bytes into the high half of each i16 lane, then
// arithmetic-shift right by 8 to sign-extend.
22224 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22225 -1, 4, -1, 5, -1, 6, -1, 7};
22226 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22227 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22228 ALo = DAG.getBitcast(ExVT, ALo);
22229 BLo = DAG.getBitcast(ExVT, BLo);
22230 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22231 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
22234 // Extract the hi parts and sign extend to i16
22236 if (Subtarget.hasSSE41()) {
22237 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22238 -1, -1, -1, -1, -1, -1, -1, -1};
22239 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22240 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22241 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
22242 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
22244 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22245 -1, 12, -1, 13, -1, 14, -1, 15};
22246 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22247 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22248 AHi = DAG.getBitcast(ExVT, AHi);
22249 BHi = DAG.getBitcast(ExVT, BHi);
22250 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22251 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22254 // Multiply, mask the lower 8bits of the lo/hi results and pack
22255 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22256 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22257 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
22258 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
22259 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22262 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
22263 if (VT == MVT::v4i32) {
22264 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
22265 "Should not custom lower when pmulld is available!");
22267 // If the upper 17 bits of each element are zero then we can use PMADD.
22268 APInt Mask17 = APInt::getHighBitsSet(32, 17);
22269 if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
22270 return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
22271 DAG.getBitcast(MVT::v8i16, A),
22272 DAG.getBitcast(MVT::v8i16, B));
22274 // Extract the odd parts.
22275 static const int UnpackMask[] = { 1, -1, 3, -1 };
22276 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
22277 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
22279 // Multiply the even parts.
22280 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
22281 // Now multiply odd parts.
22282 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
22284 Evens = DAG.getBitcast(VT, Evens);
22285 Odds = DAG.getBitcast(VT, Odds);
22287 // Merge the two vectors back together with a shuffle. This expands into 2
22289 static const int ShufMask[] = { 0, 4, 2, 6 };
22290 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
22293 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
22294 "Only know how to lower V2I64/V4I64/V8I64 multiply");
22296 // 32-bit vector types used for MULDQ/MULUDQ.
22297 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22299 // MULDQ returns the 64-bit result of the signed multiplication of the lower
22300 // 32-bits. We can lower with this if the sign bits stretch that far.
22301 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
22302 DAG.ComputeNumSignBits(B) > 32) {
22303 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
22304 DAG.getBitcast(MulVT, B));
22307 // Ahi = psrlqi(a, 32);
22308 // Bhi = psrlqi(b, 32);
22310 // AloBlo = pmuludq(a, b);
22311 // AloBhi = pmuludq(a, Bhi);
22312 // AhiBlo = pmuludq(Ahi, b);
22314 // Hi = psllqi(AloBhi + AhiBlo, 32);
22315 // return AloBlo + Hi;
22316 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
22317 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
22318 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
22320 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
22321 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
22322 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
22324 // If DQI is supported we can use MULLQ, but MULUDQ is still better if the
22325 // the high bits are known to be zero.
22326 if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
22329 // Bit cast to 32-bit vectors for MULUDQ.
22330 SDValue Alo = DAG.getBitcast(MulVT, A);
22331 SDValue Blo = DAG.getBitcast(MulVT, B);
22333 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22335 // Only multiply lo/hi halves that aren't known to be zero.
22336 SDValue AloBlo = Zero;
22337 if (!ALoIsZero && !BLoIsZero)
22338 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
22340 SDValue AloBhi = Zero;
22341 if (!ALoIsZero && !BHiIsZero) {
22342 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
22343 Bhi = DAG.getBitcast(MulVT, Bhi);
22344 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
22347 SDValue AhiBlo = Zero;
22348 if (!AHiIsZero && !BLoIsZero) {
22349 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
22350 Ahi = DAG.getBitcast(MulVT, Ahi);
22351 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
// Combine cross terms into the high 32 bits and add the low product.
22354 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
22355 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
22357 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
// Custom lowering for ISD::MULHU/MULHS (multiply returning the high half of
// each lane's product) for vXi8 vector types, which have no native x86
// multiply. Strategy: widen each i8 lane to i16 (zero-extend for MULHU,
// sign-extend for MULHS), multiply, logically shift the 16-bit products
// right by 8, and pack/truncate back to i8.
// NOTE(review): some source lines of this routine are elided in this excerpt
// (embedded original line numbers are non-contiguous); comments below
// describe only the visible code.
22360 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
22361 SelectionDAG &DAG) {
22363 MVT VT = Op.getSimpleValueType();
22365 // Decompose 256-bit ops into smaller 128-bit ops.
22366 if (VT.is256BitVector() && !Subtarget.hasInt256())
22367 return Lower256IntArith(Op, DAG);
22369 // Only i8 vectors should need custom lowering after this.
22370 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
22371 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
22372 "Unsupported vector type");
22374 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
22375 // logical shift down the upper half and pack back to i8.
22376 SDValue A = Op.getOperand(0);
22377 SDValue B = Op.getOperand(1);
22379 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
22380 // and then ashr/lshr the upper bits down to the lower bits before multiply.
// ExShift/ExAVX select the signedness-appropriate shift and extension
// opcodes once, so both the MULHU and MULHS cases share the code below.
22381 unsigned Opcode = Op.getOpcode();
22382 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
22383 unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
22385 // For 512-bit vectors, split into 256-bit vectors to allow the
22386 // sign-extension to occur.
22387 if (VT == MVT::v64i8)
22388 return Lower512IntArith(Op, DAG);
22390 // AVX2 implementations - extend xmm subvectors to ymm.
22391 if (Subtarget.hasInt256()) {
22392 unsigned NumElems = VT.getVectorNumElements();
// Lo/Hi start life as subvector-index constants; on the non-BWI v16i8
// path below they are reused as EXTRACT_SUBVECTOR indices.
22393 SDValue Lo = DAG.getIntPtrConstant(0, dl);
22394 SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
22396 if (VT == MVT::v32i8) {
// AVX512BW: do the whole operation in v32i16 and truncate back to i8.
22397 if (Subtarget.hasBWI()) {
22398 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
22399 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
22400 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
22401 Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
22402 DAG.getConstant(8, dl, MVT::v32i16));
22403 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
// Plain AVX2: split into two 128-bit halves, widen each to v16i16,
// multiply and shift each half down by 8.
22405 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
22406 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
22407 SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
22408 SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
22409 ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
22410 BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
22411 AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
22412 BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
22413 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22414 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
22415 DAG.getConstant(8, dl, MVT::v16i16));
22416 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22417 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
22418 DAG.getConstant(8, dl, MVT::v16i16));
22419 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
22420 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
22421 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
22422 16, 17, 18, 19, 20, 21, 22, 23};
22423 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22424 24, 25, 26, 27, 28, 29, 30, 31};
22425 return DAG.getNode(X86ISD::PACKUS, dl, VT,
22426 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
22427 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
// v16i8 with AVX2: widen the whole vector to v16i16 in one go.
22430 assert(VT == MVT::v16i8 && "Unexpected VT");
22432 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
22433 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
22434 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
22435 Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
22436 DAG.getConstant(8, dl, MVT::v16i16));
22437 // If we have BWI we can use truncate instruction.
22438 if (Subtarget.hasBWI())
22439 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
// Otherwise extract the two v8i16 halves (Lo/Hi are the index constants
// built above) and pack them back to v16i8.
22440 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
22441 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
22442 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
// Pre-AVX2 (SSE2/SSE41) path: process the lo/hi byte halves as v8i16.
22445 assert(VT == MVT::v16i8 &&
22446 "Pre-AVX2 support only supports v16i8 multiplication");
22447 MVT ExVT = MVT::v8i16;
22448 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
22450 // Extract the lo parts and zero/sign extend to i16.
22452 if (Subtarget.hasSSE41()) {
22453 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
22454 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
// Pre-SSE41: shuffle each lo byte into the high byte of an i16 lane,
// then shift down by 8 (SRL for unsigned, SRA for signed) to extend.
22456 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22457 -1, 4, -1, 5, -1, 6, -1, 7};
22458 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22459 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22460 ALo = DAG.getBitcast(ExVT, ALo);
22461 BLo = DAG.getBitcast(ExVT, BLo);
22462 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22463 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
22466 // Extract the hi parts and zero/sign extend to i16.
22468 if (Subtarget.hasSSE41()) {
// SSE41: move the upper 8 bytes down first, then use the in-vector extend.
22469 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22470 -1, -1, -1, -1, -1, -1, -1, -1};
22471 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22472 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22473 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
22474 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
22476 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22477 -1, 12, -1, 13, -1, 14, -1, 15};
22478 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22479 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22480 AHi = DAG.getBitcast(ExVT, AHi);
22481 BHi = DAG.getBitcast(ExVT, BHi);
22482 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22483 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22486 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
22487 // pack back to v16i8.
22488 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22489 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22490 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
22491 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
22492 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
// Lower a 128-bit integer division/remainder node to a runtime-library call
// on Win64, which has no native i128 support. Each i128 operand is stored to
// a 16-byte-aligned stack temporary and passed by pointer; the libcall's
// return value is modelled as v2i64 and bitcast back to the node's i128 type.
// NOTE(review): some lines (e.g. the isSigned/LC declarations and SDLoc) are
// elided in this excerpt; comments describe only the visible code.
22495 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
22496 assert(Subtarget.isTargetWin64() && "Unexpected target");
22497 EVT VT = Op.getValueType();
22498 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
22499 "Unexpected return type for lowering");
// Select the RTLIB entry matching the opcode; signedness determines the
// sext/zext attributes placed on the call result below.
22503 switch (Op->getOpcode()) {
22504 default: llvm_unreachable("Unexpected request for libcall!");
22505 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
22506 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
22507 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
22508 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
22509 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
22510 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
22514 SDValue InChain = DAG.getEntryNode();
22516 TargetLowering::ArgListTy Args;
22517 TargetLowering::ArgListEntry Entry;
// Spill each i128 operand to a 16-byte-aligned stack slot and pass the
// slot's address (pointer-to-i128) as the actual call argument.
22518 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
22519 EVT ArgVT = Op->getOperand(i).getValueType();
22520 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
22521 "Unexpected argument type for lowering");
22522 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
22523 Entry.Node = StackPtr;
22524 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
22525 MachinePointerInfo(), /* Alignment = */ 16);
22526 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22527 Entry.Ty = PointerType::get(ArgTy,0);
22528 Entry.IsSExt = false;
22529 Entry.IsZExt = false;
22530 Args.push_back(Entry);
22533 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
22534 getPointerTy(DAG.getDataLayout()));
// Build the call: the result type is declared as v2i64 (presumably so the
// 128-bit value is returned in an XMM register per the Win64 convention —
// NOTE(review): confirm against the calling-convention code), then bitcast
// back to the original i128 VT.
22536 TargetLowering::CallLoweringInfo CLI(DAG);
22537 CLI.setDebugLoc(dl)
22540 getLibcallCallingConv(LC),
22541 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
22544 .setSExtResult(isSigned)
22545 .setZExtResult(!isSigned);
22547 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
22548 return DAG.getBitcast(VT, CallInfo.first);
// Lower ISD::SMUL_LOHI/UMUL_LOHI for vXi32 vectors using PMULUDQ/PMULDQ,
// which multiply only the even-indexed i32 lanes into full i64 products.
// Two multiplies (even lanes, and odd lanes shuffled into even positions)
// yield all products; shuffles then rebuild the low and high result vectors.
// NOTE(review): some lines are elided in this excerpt (e.g. SDLoc, the Ops[]
// declaration at 22568, the Opcode selection at 22605).
22551 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
22552 SelectionDAG &DAG) {
22553 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
22554 MVT VT = Op0.getSimpleValueType();
22557 // Decompose 256-bit ops into smaller 128-bit ops.
22558 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22559 unsigned Opcode = Op.getOpcode();
22560 unsigned NumElems = VT.getVectorNumElements();
22561 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
22562 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
22563 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
22564 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
22565 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
// Each half-width MUL_LOHI produces two results (low, high); re-concatenate
// the matching halves of each result.
22566 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
22567 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
22569 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
22570 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
22572 return DAG.getMergeValues(Ops, dl);
22575 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
22576 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
22577 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
22579 int NumElts = VT.getVectorNumElements();
22581 // PMULxD operations multiply each even value (starting at 0) of LHS with
22582 // the related value of RHS and produce a widen result.
22583 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22584 // => <2 x i64> <ae|cg>
22586 // In other word, to have all the results, we need to perform two PMULxD:
22587 // 1. one with the even values.
22588 // 2. one with the odd values.
22589 // To achieve #2, with need to place the odd values at an even position.
22591 // Place the odd value at an even position (basically, shift all values 1
22592 // step to the left):
22593 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
22594 // <a|b|c|d> => <b|undef|d|undef>
22595 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
22596 makeArrayRef(&Mask[0], NumElts));
22597 // <e|f|g|h> => <f|undef|h|undef>
22598 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
22599 makeArrayRef(&Mask[0], NumElts));
22601 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
// Signed PMULDQ is only available from SSE4.1; without it the unsigned
// multiply is used and the high halves are corrected afterwards.
22603 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
22604 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
22606 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
22607 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22608 // => <2 x i64> <ae|cg>
22609 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
22610 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
22611 // => <2 x i64> <bf|dh>
22612 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
22614 // Shuffle it back into the right order.
22615 SmallVector<int, 16> HighMask(NumElts);
22616 SmallVector<int, 16> LowMask(NumElts);
22617 for (int i = 0; i != NumElts; ++i) {
22618 HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
22619 LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
22622 SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
22623 SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
22625 // If we have a signed multiply but no PMULDQ fix up the high parts of a
22626 // unsigned multiply.
// Standard identity: (a*b)_hi^signed = (a*b)_hi^unsigned
//                    - (a < 0 ? b : 0) - (b < 0 ? a : 0).
// The SRA-by-(elided constant, presumably 31 — NOTE(review): confirm)
// produces an all-ones mask for negative lanes, so the ANDs select the
// conditional subtrahends.
22627 if (IsSigned && !Subtarget.hasSSE41()) {
22628 SDValue ShAmt = DAG.getConstant(
22630 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
22631 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
22632 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
22633 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
22634 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
22636 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
22637 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
22640 // The first result of MUL_LOHI is actually the low value, followed by the
22642 SDValue Ops[] = {Lows, Highs};
22643 return DAG.getMergeValues(Ops, dl);
22646 // Return true if the required (according to Opcode) shift-imm form is natively
22647 // supported by the Subtarget
// Rules encoded below: sub-16-bit elements have no immediate-shift form;
// AVX512 covers 512-bit vectors (i16 elements additionally need BWI);
// otherwise SSE2/AVX2 provide logical shifts for 128/256-bit vectors, and
// arithmetic shifts for everything except vXi64 (which needs AVX512).
22648 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
22650 if (VT.getScalarSizeInBits() < 16)
22653 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
22654 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
22657 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
22658 (VT.is256BitVector() && Subtarget.hasInt256());
// Pre-AVX512 x86 has no 64-bit arithmetic right shift instruction.
22660 bool AShift = LShift && (Subtarget.hasAVX512() ||
22661 (VT != MVT::v2i64 && VT != MVT::v4i64));
22662 return (Opcode == ISD::SRA) ? AShift : LShift;
22665 // The shift amount is a variable, but it is the same for all vector lanes.
22666 // These instructions are defined together with shift-immediate.
// Shift-by-scalar availability matches shift-by-immediate availability, so
// this simply delegates to SupportedVectorShiftWithImm.
22668 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
22670 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
22673 // Return true if the required (according to Opcode) variable-shift form is
22674 // natively supported by the Subtarget
// Per-lane variable shifts require at least AVX2; vXi16 variants require
// AVX512BW; AVX512 otherwise supports all remaining forms. Pre-AVX512,
// 128/256-bit logical shifts are available but vXi64 arithmetic shifts are
// not.
22675 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
22678 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
22681 // vXi16 supported only on AVX-512, BWI
22682 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
22685 if (Subtarget.hasAVX512())
22688 bool LShift = VT.is128BitVector() || VT.is256BitVector();
22689 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
22690 return (Opcode == ISD::SRA) ? AShift : LShift;
// Try to lower a vector shift whose amount is a constant: either a uniform
// (splat) constant build_vector, or an i64 amount reassembled from i32
// constant parts (the common 32-bit-target expansion). Uses the X86ISD
// immediate-shift nodes (VSHLI/VSRLI/VSRAI) where natively supported, and
// custom expansions for i64 arithmetic shifts and vXi8 shifts.
// NOTE(review): several lines are elided in this excerpt (e.g. the final
// fall-through return, some braces, and lambda-internal declarations);
// comments describe only the visible code.
22693 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
22694 const X86Subtarget &Subtarget) {
22695 MVT VT = Op.getSimpleValueType();
22697 SDValue R = Op.getOperand(0);
22698 SDValue Amt = Op.getOperand(1);
22700 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22701 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
// Emulates a 64-bit arithmetic right shift (no native instruction before
// AVX512) by operating on the value bitcast to i32 lanes: 32-bit VSRAI
// provides the sign-propagating halves, and shuffles recombine them.
22703 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22704 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22705 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22706 SDValue Ex = DAG.getBitcast(ExVT, R);
22708 // ashr(R, 63) === cmp_slt(R, 0)
22709 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22710 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22711 "Unsupported PCMPGT op");
22712 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
22713 getZeroVector(VT, Subtarget, DAG, dl), R);
22716 if (ShiftAmt >= 32) {
22717 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
22719 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22720 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22721 ShiftAmt - 32, DAG);
22722 if (VT == MVT::v2i64)
22723 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22724 if (VT == MVT::v4i64)
22725 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22726 {9, 1, 11, 3, 13, 5, 15, 7});
22728 // SRA upper i32, SHL whole i64 and select lower i32.
22729 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22732 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22733 Lower = DAG.getBitcast(ExVT, Lower);
22734 if (VT == MVT::v2i64)
22735 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22736 if (VT == MVT::v4i64)
22737 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22738 {8, 1, 10, 3, 12, 5, 14, 7});
22740 return DAG.getBitcast(VT, Ex);
22743 // Optimize shl/srl/sra with constant shift amount.
22744 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22745 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22746 uint64_t ShiftAmt = ShiftConst->getZExtValue();
22748 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22749 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22751 // i64 SRA needs to be performed as partial shifts.
22752 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22753 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22754 Op.getOpcode() == ISD::SRA)
22755 return ArithmeticShiftRight64(ShiftAmt);
// vXi8: no native byte shifts. Perform the shift in i16 halves, then mask
// off the bits that would have crossed byte boundaries.
22757 if (VT == MVT::v16i8 ||
22758 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22759 VT == MVT::v64i8) {
22760 unsigned NumElts = VT.getVectorNumElements();
22761 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22763 // Simple i8 add case
22764 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22765 return DAG.getNode(ISD::ADD, dl, VT, R, R);
22767 // ashr(R, 7) === cmp_slt(R, 0)
22768 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22769 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22770 if (VT.is512BitVector()) {
22771 assert(VT == MVT::v64i8 && "Unexpected element type!");
22772 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22773 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22775 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22778 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22779 if (VT == MVT::v16i8 && Subtarget.hasXOP())
22782 if (Op.getOpcode() == ISD::SHL) {
22783 // Make a large shift.
22784 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22786 SHL = DAG.getBitcast(VT, SHL);
22787 // Zero out the rightmost bits.
// uint8_t(-1U << ShiftAmt) keeps only the bits a true per-byte SHL would
// produce, clearing bits shifted in from the neighbouring byte.
22788 return DAG.getNode(ISD::AND, dl, VT, SHL,
22789 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22791 if (Op.getOpcode() == ISD::SRL) {
22792 // Make a large shift.
22793 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22795 SRL = DAG.getBitcast(VT, SRL);
22796 // Zero out the leftmost bits.
22797 return DAG.getNode(ISD::AND, dl, VT, SRL,
22798 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22800 if (Op.getOpcode() == ISD::SRA) {
22801 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
// Mask has a 1 at the (shifted) position of each byte's sign bit; the
// XOR/SUB pair sign-extends the logically-shifted result.
22802 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22804 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22805 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22806 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22809 llvm_unreachable("Unknown shift opcode.");
22814 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
22815 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22816 if (!Subtarget.hasXOP() &&
22817 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22818 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22820 // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
22821 unsigned SubVectorScale = 1;
22822 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22824 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22825 Amt = Amt.getOperand(0);
22828 // Peek through any splat that was introduced for i64 shift vectorization.
22829 int SplatIndex = -1;
22830 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22831 if (SVN->isSplat()) {
22832 SplatIndex = SVN->getSplatIndex();
22833 Amt = Amt.getOperand(0);
22834 assert(SplatIndex < (int)VT.getVectorNumElements() &&
22835 "Splat shuffle referencing second operand");
// Only amounts of the form bitcast(build_vector of constants) are handled.
22838 if (Amt.getOpcode() != ISD::BITCAST ||
22839 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22842 Amt = Amt.getOperand(0);
// Ratio = number of build_vector elements forming one i64 lane; each
// element contributes 64/Ratio bits to the reassembled shift amount.
22843 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22844 (SubVectorScale * VT.getVectorNumElements());
22845 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22846 uint64_t ShiftAmt = 0;
22847 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22848 for (unsigned i = 0; i != Ratio; ++i) {
22849 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22853 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
22856 // Check remaining shift amounts (if not a splat).
// All lanes must reassemble to the same 64-bit amount for the uniform
// immediate-shift forms below to be valid.
22857 if (SplatIndex < 0) {
22858 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22859 uint64_t ShAmt = 0;
22860 for (unsigned j = 0; j != Ratio; ++j) {
22861 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22865 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22867 if (ShAmt != ShiftAmt)
22872 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22873 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22875 if (Op.getOpcode() == ISD::SRA)
22876 return ArithmeticShiftRight64(ShiftAmt);
// Try to lower a vector shift whose amount is variable but uniform across
// all lanes: extract the splatted scalar amount and use the X86ISD
// shift-by-scalar nodes (VSHL/VSRL/VSRA read the amount from the low qword
// of their vector operand). Also handles a v2i64 amount built from repeated
// i32 parts (32-bit-target expansion).
// NOTE(review): some lines are elided in this excerpt (e.g. SDLoc, the
// BaseShAmt declaration, and the fall-through returns).
22882 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22883 const X86Subtarget &Subtarget) {
22884 MVT VT = Op.getSimpleValueType();
22886 SDValue R = Op.getOperand(0);
22887 SDValue Amt = Op.getOperand(1);
22889 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22890 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22892 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22893 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22895 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22897 MVT EltVT = VT.getVectorElementType();
// Look for a uniform scalar shift amount: first a splat build_vector...
22899 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22900 // Check if this build_vector node is doing a splat.
22901 // If so, then set BaseShAmt equal to the splat value.
22902 BaseShAmt = BV->getSplatValue();
22903 if (BaseShAmt && BaseShAmt.isUndef())
22904 BaseShAmt = SDValue();
// ...then a splat shuffle, tracing back to the splatted source element.
22906 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22907 Amt = Amt.getOperand(0);
22909 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22910 if (SVN && SVN->isSplat()) {
22911 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22912 SDValue InVec = Amt.getOperand(0);
22913 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22914 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22915 "Unexpected shuffle index found!");
22916 BaseShAmt = InVec.getOperand(SplatIdx);
22917 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22918 if (ConstantSDNode *C =
22919 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22920 if (C->getZExtValue() == SplatIdx)
22921 BaseShAmt = InVec.getOperand(1);
22926 // Avoid introducing an extract element from a shuffle.
22927 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22928 DAG.getIntPtrConstant(SplatIdx, dl));
22932 if (BaseShAmt.getNode()) {
// Normalize the scalar amount to a legal width (i32 or i64) before
// handing it to getTargetVShiftNode.
22933 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22934 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22935 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22936 else if (EltVT.bitsLT(MVT::i32))
22937 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22939 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22943 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
// If every i64 lane of the amount is built from the same group of scalar
// parts, the amount is uniform and the shift-by-scalar node can be used
// with the original vector amount directly.
22944 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
22945 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22946 Amt = Amt.getOperand(0);
22947 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22948 VT.getVectorNumElements();
22949 std::vector<SDValue> Vals(Ratio);
22950 for (unsigned i = 0; i != Ratio; ++i)
22951 Vals[i] = Amt.getOperand(i);
22952 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22953 for (unsigned j = 0; j != Ratio; ++j)
22954 if (Vals[j] != Amt.getOperand(i + j))
22958 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22959 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22964 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22965 SelectionDAG &DAG) {
22966 MVT VT = Op.getSimpleValueType();
22968 SDValue R = Op.getOperand(0);
22969 SDValue Amt = Op.getOperand(1);
22970 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22972 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22973 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22975 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22978 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22981 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22984 // XOP has 128-bit variable logical/arithmetic shifts.
22985 // +ve/-ve Amt = shift left/right.
22986 if (Subtarget.hasXOP() &&
22987 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22988 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22989 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22990 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22991 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22993 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22994 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22995 if (Op.getOpcode() == ISD::SRA)
22996 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22999 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
23000 // shifts per-lane and then shuffle the partial results back together.
23001 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
23002 // Splat the shift amounts so the scalar shifts above will catch it.
23003 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
23004 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
23005 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
23006 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
23007 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
23010 // i64 vector arithmetic shift can be emulated with the transform:
23011 // M = lshr(SIGN_MASK, Amt)
23012 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
23013 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
23014 Op.getOpcode() == ISD::SRA) {
23015 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
23016 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
23017 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23018 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
23019 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
23023 // If possible, lower this packed shift into a vector multiply instead of
23024 // expanding it into a sequence of scalar shifts.
23025 // Do this only if the vector shift count is a constant build_vector.
23026 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
23027 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
23028 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
23029 SmallVector<SDValue, 8> Elts;
23030 MVT SVT = VT.getVectorElementType();
23031 unsigned SVTBits = SVT.getSizeInBits();
23032 APInt One(SVTBits, 1);
23033 unsigned NumElems = VT.getVectorNumElements();
23035 for (unsigned i=0; i !=NumElems; ++i) {
23036 SDValue Op = Amt->getOperand(i);
23037 if (Op->isUndef()) {
23038 Elts.push_back(Op);
23042 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
23043 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
23044 uint64_t ShAmt = C.getZExtValue();
23045 if (ShAmt >= SVTBits) {
23046 Elts.push_back(DAG.getUNDEF(SVT));
23049 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
23051 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
23052 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
23055 // Lower SHL with variable shift amount.
23056 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
23057 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
23059 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
23060 DAG.getConstant(0x3f800000U, dl, VT));
23061 Op = DAG.getBitcast(MVT::v4f32, Op);
23062 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
23063 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
23066 // If possible, lower this shift as a sequence of two shifts by
23067 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
23069 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
23071 // Could be rewritten as:
23072 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
23074 // The advantage is that the two shifts from the example would be
23075 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
23076 // the vector shift into four scalar shifts plus four pairs of vector
23078 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
23079 bool UseMOVSD = false;
23080 bool CanBeSimplified;
23081 // The splat value for the first packed shift (the 'X' from the example).
23082 SDValue Amt1 = Amt->getOperand(0);
23083 // The splat value for the second packed shift (the 'Y' from the example).
23084 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
23086 // See if it is possible to replace this node with a sequence of
23087 // two shifts followed by a MOVSS/MOVSD/PBLEND.
23088 if (VT == MVT::v4i32) {
23089 // Check if it is legal to use a MOVSS.
23090 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
23091 Amt2 == Amt->getOperand(3);
23092 if (!CanBeSimplified) {
23093 // Otherwise, check if we can still simplify this node using a MOVSD.
23094 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
23095 Amt->getOperand(2) == Amt->getOperand(3);
23097 Amt2 = Amt->getOperand(2);
23100 // Do similar checks for the case where the machine value type
23102 CanBeSimplified = Amt1 == Amt->getOperand(1);
23103 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
23104 CanBeSimplified = Amt2 == Amt->getOperand(i);
23106 if (!CanBeSimplified) {
23108 CanBeSimplified = true;
23109 Amt2 = Amt->getOperand(4);
23110 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
23111 CanBeSimplified = Amt1 == Amt->getOperand(i);
23112 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
23113 CanBeSimplified = Amt2 == Amt->getOperand(j);
23117 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
23118 isa<ConstantSDNode>(Amt2)) {
23119 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
23121 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
23122 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
23124 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
23125 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
23126 SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
23127 SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
23129 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23130 BitCast2, {0, 1, 6, 7}));
23131 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23132 BitCast2, {0, 5, 6, 7}));
23136 // v4i32 Non Uniform Shifts.
23137 // If the shift amount is constant we can shift each lane using the SSE2
23138 // immediate shifts, else we need to zero-extend each lane to the lower i64
23139 // and shift using the SSE2 variable shifts.
23140 // The separate results can then be blended together.
23141 if (VT == MVT::v4i32) {
23142 unsigned Opc = Op.getOpcode();
23143 SDValue Amt0, Amt1, Amt2, Amt3;
23145 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
23146 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
23147 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
23148 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
23150 // ISD::SHL is handled above but we include it here for completeness.
23153 llvm_unreachable("Unknown target vector shift node");
23155 Opc = X86ISD::VSHL;
23158 Opc = X86ISD::VSRL;
23161 Opc = X86ISD::VSRA;
23164 // The SSE2 shifts use the lower i64 as the same shift amount for
23165 // all lanes and the upper i64 is ignored. These shuffle masks
23166 // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
23167 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23168 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
23169 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
23170 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
23171 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
23174 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
23175 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
23176 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
23177 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
23178 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
23179 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
23180 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
23183 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
23184 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
23185 // make the existing SSE solution better.
23186 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
23187 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
23188 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
23189 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
23190 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
23191 "Unexpected vector type");
23192 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
23193 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
23195 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23196 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
23197 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
23198 return DAG.getNode(ISD::TRUNCATE, dl, VT,
23199 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
23202 if (VT == MVT::v16i8 ||
23203 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
23204 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
23205 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23206 unsigned ShiftOpcode = Op->getOpcode();
23208 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23209 if (VT.is512BitVector()) {
23210 // On AVX512BW targets we make use of the fact that VSELECT lowers
23211 // to a masked blend which selects bytes based just on the sign bit
23212 // extracted to a mask.
23213 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23214 V0 = DAG.getBitcast(VT, V0);
23215 V1 = DAG.getBitcast(VT, V1);
23216 Sel = DAG.getBitcast(VT, Sel);
23217 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
23218 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23219 } else if (Subtarget.hasSSE41()) {
23220 // On SSE41 targets we make use of the fact that VSELECT lowers
23221 // to PBLENDVB which selects bytes based just on the sign bit.
23222 V0 = DAG.getBitcast(VT, V0);
23223 V1 = DAG.getBitcast(VT, V1);
23224 Sel = DAG.getBitcast(VT, Sel);
23225 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23227 // On pre-SSE41 targets we test for the sign bit by comparing to
23228 // zero - a negative value will set all bits of the lanes to true
23229 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23230 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
23231 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
23232 return DAG.getSelect(dl, SelVT, C, V0, V1);
23235 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23236 // We can safely do this using i16 shifts as we're only interested in
23237 // the 3 lower bits of each byte.
23238 Amt = DAG.getBitcast(ExtVT, Amt);
23239 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
23240 Amt = DAG.getBitcast(VT, Amt);
23242 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
23243 // r = VSELECT(r, shift(r, 4), a);
23245 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23246 R = SignBitSelect(VT, Amt, M, R);
23249 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23251 // r = VSELECT(r, shift(r, 2), a);
23252 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23253 R = SignBitSelect(VT, Amt, M, R);
23256 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23258 // return VSELECT(r, shift(r, 1), a);
23259 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23260 R = SignBitSelect(VT, Amt, M, R);
23264 if (Op->getOpcode() == ISD::SRA) {
23265 // For SRA we need to unpack each byte to the higher byte of a i16 vector
23266 // so we can correctly sign extend. We don't care what happens to the
23268 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
23269 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
23270 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
23271 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
23272 ALo = DAG.getBitcast(ExtVT, ALo);
23273 AHi = DAG.getBitcast(ExtVT, AHi);
23274 RLo = DAG.getBitcast(ExtVT, RLo);
23275 RHi = DAG.getBitcast(ExtVT, RHi);
23277 // r = VSELECT(r, shift(r, 4), a);
23278 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23279 DAG.getConstant(4, dl, ExtVT));
23280 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23281 DAG.getConstant(4, dl, ExtVT));
23282 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23283 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23286 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23287 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23289 // r = VSELECT(r, shift(r, 2), a);
23290 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23291 DAG.getConstant(2, dl, ExtVT));
23292 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23293 DAG.getConstant(2, dl, ExtVT));
23294 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23295 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23298 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23299 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23301 // r = VSELECT(r, shift(r, 1), a);
23302 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23303 DAG.getConstant(1, dl, ExtVT));
23304 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23305 DAG.getConstant(1, dl, ExtVT));
23306 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23307 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23309 // Logical shift the result back to the lower byte, leaving a zero upper
23311 // meaning that we can safely pack with PACKUSWB.
23313 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
23315 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
23316 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
23320 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
23321 MVT ExtVT = MVT::v8i32;
23322 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23323 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
23324 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
23325 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
23326 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
23327 ALo = DAG.getBitcast(ExtVT, ALo);
23328 AHi = DAG.getBitcast(ExtVT, AHi);
23329 RLo = DAG.getBitcast(ExtVT, RLo);
23330 RHi = DAG.getBitcast(ExtVT, RHi);
23331 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
23332 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
23333 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
23334 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
23335 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23338 if (VT == MVT::v8i16) {
23339 unsigned ShiftOpcode = Op->getOpcode();
23341 // If we have a constant shift amount, the non-SSE41 path is best as
23342 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
23343 bool UseSSE41 = Subtarget.hasSSE41() &&
23344 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23346 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
23347 // On SSE41 targets we make use of the fact that VSELECT lowers
23348 // to PBLENDVB which selects bytes based just on the sign bit.
23350 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
23351 V0 = DAG.getBitcast(ExtVT, V0);
23352 V1 = DAG.getBitcast(ExtVT, V1);
23353 Sel = DAG.getBitcast(ExtVT, Sel);
23354 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
23356 // On pre-SSE41 targets we splat the sign bit - a negative value will
23357 // set all bits of the lanes to true and VSELECT uses that in
23358 // its OR(AND(V0,C),AND(V1,~C)) lowering.
23360 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
23361 return DAG.getSelect(dl, VT, C, V0, V1);
23364 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
23366 // On SSE41 targets we need to replicate the shift mask in both
23367 // bytes for PBLENDVB.
23370 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
23371 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
23373 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
23376 // r = VSELECT(r, shift(r, 8), a);
23377 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
23378 R = SignBitSelect(Amt, M, R);
23381 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23383 // r = VSELECT(r, shift(r, 4), a);
23384 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23385 R = SignBitSelect(Amt, M, R);
23388 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23390 // r = VSELECT(r, shift(r, 2), a);
23391 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23392 R = SignBitSelect(Amt, M, R);
23395 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23397 // return VSELECT(r, shift(r, 1), a);
23398 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23399 R = SignBitSelect(Amt, M, R);
23403 // Decompose 256-bit shifts into smaller 128-bit shifts.
23404 if (VT.is256BitVector())
23405 return Lower256IntArith(Op, DAG);
// Custom-lower vector rotate nodes (ISD::ROTL / ISD::ROTR).
// AVX512 targets use VPROLI/VPRORI for uniform-constant amounts (and fall
// back to the variable VPROLV/VPRORV forms); all remaining paths require
// XOP, whose VPROT* instructions provide 128-bit variable and immediate
// rotates. 256-bit XOP rotates are split into two 128-bit ops.
// NOTE(review): embedded original line numbers are non-contiguous — the
// extraction dropped several statements (the SDLoc, the UndefElts
// declaration, closing braces and the final return); code text left as-is.
23410 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
23411 SelectionDAG &DAG) {
23412 MVT VT = Op.getSimpleValueType();
23414 SDValue R = Op.getOperand(0);
23415 SDValue Amt = Op.getOperand(1);
23416 unsigned Opcode = Op.getOpcode();
23417 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23419 if (Subtarget.hasAVX512()) {
23420 // Attempt to rotate by immediate.
23422 SmallVector<APInt, 16> EltBits;
// If every (defined) element of the amount is the same constant, emit the
// immediate-form rotate; the amount is reduced mod the element width.
23423 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
23424 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
23425 return EltBits[0] == V;
23427 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
23428 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
23429 return DAG.getNode(Op, DL, VT, R,
23430 DAG.getConstant(RotateAmt, DL, MVT::i8));
23434 // Else, fall-back on VPROLV/VPRORV.
// Non-AVX512 path: only XOP targets reach here, and only ROTL — ROTR is
// expected to have been canonicalized away by the caller.
23438 assert(VT.isVector() && "Custom lowering only for vector rotates!");
23439 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
23440 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
23442 // XOP has 128-bit vector variable + immediate rotates.
23443 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
23445 // Split 256-bit integers.
23446 if (VT.is256BitVector())
23447 return Lower256IntArith(Op, DAG);
23449 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
23451 // Attempt to rotate by immediate.
// A splatted build-vector amount becomes a VPROTx immediate.
23452 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23453 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23454 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23455 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23456 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
23457 DAG.getConstant(RotateAmt, DL, MVT::i8));
23461 // Use general rotate by variable (per-element).
// Lower the {S,U}{ADD,SUB,MUL}O "arithmetic with overflow" nodes into the
// corresponding X86ISD arithmetic node (which also defines EFLAGS) plus a
// SETCC on the relevant condition (COND_O for signed, COND_B for unsigned).
// NOTE(review): the extraction dropped the `case ISD::SADDO/UADDO/SSUBO/
// USUBO/SMULO:` labels and the closing braces of the switch arms — the
// grouping below must be read against the upstream file; code left as-is.
23465 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23466 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23467 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23468 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23469 // has only one use.
23470 SDNode *N = Op.getNode();
23471 SDValue LHS = N->getOperand(0);
23472 SDValue RHS = N->getOperand(1);
23473 unsigned BaseOp = 0;
23474 X86::CondCode Cond;
23476 switch (Op.getOpcode()) {
23477 default: llvm_unreachable("Unknown ovf instruction!");
// (SADDO arm) add-of-one is selected as INC; INC does not set CF, so this
// shortcut is only valid for the signed (COND_O) flavor.
23479 // A subtract of one will be selected as a INC. Note that INC doesn't
23480 // set CF, so we can't do this for UADDO.
23481 if (isOneConstant(RHS)) {
23482 BaseOp = X86ISD::INC;
23483 Cond = X86::COND_O;
23486 BaseOp = X86ISD::ADD;
23487 Cond = X86::COND_O;
// (UADDO arm) unsigned add overflow is the carry flag.
23490 BaseOp = X86ISD::ADD;
23491 Cond = X86::COND_B;
// (SSUBO arm) sub-of-one is selected as DEC; DEC does not set CF.
23494 // A subtract of one will be selected as a DEC. Note that DEC doesn't
23495 // set CF, so we can't do this for USUBO.
23496 if (isOneConstant(RHS)) {
23497 BaseOp = X86ISD::DEC;
23498 Cond = X86::COND_O;
23501 BaseOp = X86ISD::SUB;
23502 Cond = X86::COND_O;
// (USUBO arm) unsigned sub overflow (borrow) is the carry flag.
23505 BaseOp = X86ISD::SUB;
23506 Cond = X86::COND_B;
// (SMULO arm) i8 has a dedicated one-operand IMUL form.
23509 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
23510 Cond = X86::COND_O;
23512 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
23513 if (N->getValueType(0) == MVT::i8) {
23514 BaseOp = X86ISD::UMUL8;
23515 Cond = X86::COND_O;
// Wide UMULO: X86ISD::UMUL produces (lo, hi, EFLAGS); overflow is read
// from result #2 here rather than falling through to the common tail.
23518 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
23520 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
23522 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
// Callers may expect an i1 overflow bit; SETCC natively yields i8.
23524 if (N->getValueType(1) == MVT::i1)
23525 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23527 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
// Common tail: emit the chosen flag-setting arithmetic node and test the
// chosen condition on its EFLAGS result (result #1).
23531 // Also sets EFLAGS.
23532 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
23533 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23535 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
23537 if (N->getValueType(1) == MVT::i1)
23538 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23540 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23543 /// Returns true if the operand type is exactly twice the native width, and
23544 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
23545 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
23546 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
23547 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
23548 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
23551 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
23552 else if (OpWidth == 128)
23553 return Subtarget.hasCmpxchg16b();
23558 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
23559 return needsCmpXchgNb(SI->getValueOperand()->getType());
23562 // Note: this turns large loads into lock cmpxchg8b/16b.
23563 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
23564 TargetLowering::AtomicExpansionKind
23565 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
23566 auto PTy = cast<PointerType>(LI->getPointerOperandType());
23567 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
23568 : AtomicExpansionKind::None;
23571 TargetLowering::AtomicExpansionKind
23572 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
23573 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23574 Type *MemType = AI->getType();
23576 // If the operand is too big, we must see if cmpxchg8/16b is available
23577 // and default to library calls otherwise.
23578 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
23579 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
23580 : AtomicExpansionKind::None;
23583 AtomicRMWInst::BinOp Op = AI->getOperation();
23586 llvm_unreachable("Unknown atomic operation");
23587 case AtomicRMWInst::Xchg:
23588 case AtomicRMWInst::Add:
23589 case AtomicRMWInst::Sub:
23590 // It's better to use xadd, xsub or xchg for these in all cases.
23591 return AtomicExpansionKind::None;
23592 case AtomicRMWInst::Or:
23593 case AtomicRMWInst::And:
23594 case AtomicRMWInst::Xor:
23595 // If the atomicrmw's result isn't actually used, we can just add a "lock"
23596 // prefix to a normal instruction for these operations.
23597 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
23598 : AtomicExpansionKind::None;
23599 case AtomicRMWInst::Nand:
23600 case AtomicRMWInst::Max:
23601 case AtomicRMWInst::Min:
23602 case AtomicRMWInst::UMax:
23603 case AtomicRMWInst::UMin:
23604 // These always require a non-trivial set of data operations on x86. We must
23605 // use a cmpxchg loop.
23606 return AtomicExpansionKind::CmpXChg;
// Replace an idempotent atomicrmw (e.g. fetch_add 0) of native width with
// mfence + atomic load, which is cheaper than a locked RMW. Returns the
// replacement load on success (upstream signature returns LoadInst*).
// NOTE(review): the extraction dropped the return-type line of the
// signature, the `return nullptr;` bodies of the early-out branches, and
// the final `return Loaded;` — code text left as-is.
23611 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
23612 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23613 Type *MemType = AI->getType();
23614 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
23615 // there is no benefit in turning such RMWs into loads, and it is actually
23616 // harmful as it introduces a mfence.
23617 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
23620 auto Builder = IRBuilder<>(AI);
23621 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
23622 auto SSID = AI->getSyncScopeID();
23623 // We must restrict the ordering to avoid generating loads with Release or
23624 // ReleaseAcquire orderings.
23625 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
23626 auto Ptr = AI->getPointerOperand();
23628 // Before the load we need a fence. Here is an example lifted from
23629 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
23632 // x.store(1, relaxed);
23633 // r1 = y.fetch_add(0, release);
23635 // y.fetch_add(42, acquire);
23636 // r2 = x.load(relaxed);
23637 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
23638 // lowered to just a load without a fence. A mfence flushes the store buffer,
23639 // making the optimization clearly correct.
23640 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
23641 // otherwise, we might be able to be more aggressive on relaxed idempotent
23642 // rmw. In practice, they do not look useful, so we don't try to be
23643 // especially clever.
// Single-thread fences need no machine barrier; bail out of the transform.
23644 if (SSID == SyncScope::SingleThread)
23645 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
23646 // the IR level, so we must wrap it in an intrinsic.
// Without MFENCE the fence cannot be emitted cheaply; bail out as well.
23649 if (!Subtarget.hasMFence())
23650 // FIXME: it might make sense to use a locked operation here but on a
23651 // different cache-line to prevent cache-line bouncing. In practice it
23652 // is probably a small win, and x86 processors without mfence are rare
23653 // enough that we do not bother.
// Emit the mfence via the SSE2 intrinsic (we are at the IR level here).
23657 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
23658 Builder.CreateCall(MFence, {});
23660 // Finally we can emit the atomic load.
23661 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
23662 AI->getType()->getPrimitiveSizeInBits());
23663 Loaded->setAtomic(Order, SSID);
// Splice the load in place of the RMW and delete the original instruction.
23664 AI->replaceAllUsesWith(Loaded);
23665 AI->eraseFromParent();
// Lower ISD::ATOMIC_FENCE. Only a seq_cst cross-thread fence needs an
// instruction: MFENCE when available, otherwise a `lock or` to a dummy
// stack slot (an idempotent locked op drains the store buffer). Everything
// else is a compiler-only barrier (X86ISD::MEMBARRIER, codegens to no-op).
// NOTE(review): the extraction dropped the `SDLoc dl` declaration and the
// head/tail lines of the `Ops[]` initializer; code text left as-is.
23669 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
23670 SelectionDAG &DAG) {
// Operand 1 encodes the ordering, operand 2 the sync scope.
23672 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
23673 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
23674 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
23675 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
23677 // The only fence that needs an instruction is a sequentially-consistent
23678 // cross-thread fence.
23679 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
23680 FenceSSID == SyncScope::System) {
23681 if (Subtarget.hasMFence())
23682 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
// No MFENCE (pre-SSE2): emit `lock or dword [esp], 0` — a locked no-op
// RMW at the top of the stack has full-fence semantics.
23684 SDValue Chain = Op.getOperand(0);
23685 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
23687 DAG.getRegister(X86::ESP, MVT::i32), // Base
23688 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
23689 DAG.getRegister(0, MVT::i32), // Index
23690 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
23691 DAG.getRegister(0, MVT::i32), // Segment.
23695 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
23696 return SDValue(Res, 0);
23699 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
23700 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
// Lower ATOMIC_CMP_SWAP_WITH_SUCCESS to LCMPXCHG: the expected value is
// pinned to the accumulator register (AL/AX/EAX/RAX by width), LCMPXCHG_DAG
// performs the locked compare-exchange, and the success flag is a SETE on
// the resulting EFLAGS.
// NOTE(review): the extraction dropped the declarations of DL/Reg/size, the
// i64 case label, parts of the Ops[] list, and the trailing return; code
// text left as-is.
23703 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
23704 SelectionDAG &DAG) {
23705 MVT T = Op.getSimpleValueType();
// Select the accumulator register and operand size for the CMPXCHG form.
23709 switch(T.SimpleTy) {
23710 default: llvm_unreachable("Invalid value type!");
23711 case MVT::i8: Reg = X86::AL; size = 1; break;
23712 case MVT::i16: Reg = X86::AX; size = 2; break;
23713 case MVT::i32: Reg = X86::EAX; size = 4; break;
// (i64 arm) CMPXCHG with RAX only exists on 64-bit targets.
23715 assert(Subtarget.is64Bit() && "Node not type legal!");
23716 Reg = X86::RAX; size = 8;
// Copy the expected value (operand 2) into the accumulator; the glue result
// ties it to the LCMPXCHG node below.
23719 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
23720 Op.getOperand(2), SDValue());
23721 SDValue Ops[] = { cpIn.getValue(0),
23724 DAG.getTargetConstant(size, DL, MVT::i8),
23725 cpIn.getValue(1) };
23726 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23727 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
23728 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
// Read back the old value from the accumulator and EFLAGS for the success
// bit (ZF set => exchange happened => COND_E).
23732 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
23733 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
23734 MVT::i32, cpOut.getValue(2));
23735 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
// Rewire the node's three results (old value, success, chain) in place.
23737 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
23738 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
23739 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
// Custom-lower BITCAST for the cases the generic legalizer cannot handle
// cheaply: 64-bit integer/short-vector sources to f64 via an SSE2 build
// vector + extract, and i64<->MMX casts (which are Legal and returned
// unchanged).
// NOTE(review): non-contiguous embedded line numbers — the extraction
// dropped the `return SDValue()` / `return Op` bodies, the `else` keywords
// and the NumElts/SVT declarations; code text left as-is.
23743 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
23744 SelectionDAG &DAG) {
23745 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
23746 MVT DstVT = Op.getSimpleValueType();
// 64-bit sources (v2i32/v4i16/v8i8/i64) being cast to f64: build a widened
// vector of the source elements, bitcast to v2f64 and extract lane 0.
23748 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
23749 SrcVT == MVT::i64) {
23750 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23751 if (DstVT != MVT::f64)
23752 // This conversion needs to be expanded.
23755 SDValue Op0 = Op->getOperand(0);
23756 SmallVector<SDValue, 16> Elts;
23760 if (SrcVT.isVector()) {
23761 NumElts = SrcVT.getVectorNumElements();
23762 SVT = SrcVT.getVectorElementType();
23764 // Widen the vector in input in the case of MVT::v2i32.
23765 // Example: from MVT::v2i32 to MVT::v4i32.
23766 for (unsigned i = 0, e = NumElts; i != e; ++i)
23767 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
23768 DAG.getIntPtrConstant(i, dl)));
// (else arm) scalar i64 source on a 32-bit target: split into two i32s.
23770 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
23771 "Unexpected source type in LowerBITCAST");
23772 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23773 DAG.getIntPtrConstant(0, dl)));
23774 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23775 DAG.getIntPtrConstant(1, dl)));
23779 // Explicitly mark the extra elements as Undef.
23780 Elts.append(NumElts, DAG.getUNDEF(SVT));
23782 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23783 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
23784 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
23785 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
23786 DAG.getIntPtrConstant(0, dl));
// Remaining cases are MMX-related casts on 64-bit targets without SSE2.
23789 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
23790 Subtarget.hasMMX() && "Unexpected custom BITCAST");
23791 assert((DstVT == MVT::i64 ||
23792 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
23793 "Unexpected custom BITCAST");
23794 // i64 <=> MMX conversions are Legal.
23795 if (SrcVT==MVT::i64 && DstVT.isVector())
23797 if (DstVT==MVT::i64 && SrcVT.isVector())
23799 // MMX <=> MMX conversions are Legal.
23800 if (SrcVT.isVector() && DstVT.isVector())
23802 // All other conversions need to be expanded.
23806 /// Compute the horizontal sum of bytes in V for the elements of VT.
23808 /// Requires V to be a byte vector and VT to be an integer vector type with
23809 /// wider elements than V's type. The width of the elements of VT determines
23810 /// how many bytes of V are summed horizontally to produce each element of the
// (continuation of the doc comment above: "... the result" — the line was
// dropped by the extraction, as were the SDLoc and some closing braces.)
23812 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
23813 const X86Subtarget &Subtarget,
23814 SelectionDAG &DAG) {
23816 MVT ByteVecVT = V.getSimpleValueType();
23817 MVT EltVT = VT.getVectorElementType();
23818 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23819 "Expected value to have byte element type.");
23820 assert(EltVT != MVT::i8 &&
23821 "Horizontal byte sum only makes sense for wider elements!");
23822 unsigned VecSize = VT.getSizeInBits();
23823 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
// i64 elements: one PSADBW against zero sums each 8-byte lane directly.
23825 // PSADBW instruction horizontally add all bytes and leave the result in i64
23826 // chunks, thus directly computes the pop count for v2i64 and v4i64.
23827 if (EltVT == MVT::i64) {
23828 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23829 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23830 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
23831 return DAG.getBitcast(VT, V);
// i32 elements: interleave with zeros, PSADBW each half, then PACKUS the
// two i64-lane results back into i32 lanes.
23834 if (EltVT == MVT::i32) {
23835 // We unpack the low half and high half into i32s interleaved with zeros so
23836 // that we can use PSADBW to horizontally sum them. The most useful part of
23837 // this is that it lines up the results of two PSADBW instructions to be
23838 // two v2i64 vectors which concatenated are the 4 population counts. We can
23839 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
23840 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
23841 SDValue V32 = DAG.getBitcast(VT, V);
23842 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
23843 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
23845 // Do the horizontal sums into two v2i64s.
23846 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23847 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23848 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23849 DAG.getBitcast(ByteVecVT, Low), Zeros);
23850 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23851 DAG.getBitcast(ByteVecVT, High), Zeros);
23853 // Merge them together.
23854 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
23855 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
23856 DAG.getBitcast(ShortVecVT, Low),
23857 DAG.getBitcast(ShortVecVT, High));
23859 return DAG.getBitcast(VT, V);
23862 // The only element type left is i16.
23863 assert(EltVT == MVT::i16 && "Unknown how to handle type");
// i16 elements: (v << 8) + v as bytes puts hi+lo byte sums in the high
// byte of each i16; shift right by 8 to land them in the low byte.
23865 // To obtain pop count for each i16 element starting from the pop count for
23866 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
23867 // right by 8. It is important to shift as i16s as i8 vector shift isn't
23868 // directly supported.
23869 SDValue ShifterV = DAG.getConstant(8, DL, VT);
23870 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23871 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
23872 DAG.getBitcast(ByteVecVT, V));
23873 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
// Lower vector CTPOP via an in-register PSHUFB nibble lookup table (needs
// SSSE3). Per-byte counts are widened with LowerHorizontalByteSum when the
// element type is larger than i8.
23876 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
23877 const X86Subtarget &Subtarget,
23878 SelectionDAG &DAG) {
23879 MVT VT = Op.getSimpleValueType();
23880 MVT EltVT = VT.getVectorElementType();
23881 unsigned VecSize = VT.getSizeInBits();
23883 // Implement a lookup table in register by using an algorithm based on:
23884 // http://wm.ite.pl/articles/sse-popcount.html
23886 // The general idea is that every lower byte nibble in the input vector is an
23887 // index into a in-register pre-computed pop count table. We then split up the
23888 // input vector in two new ones: (1) a vector with only the shifted-right
23889 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
23890 // masked out higher ones) for each byte. PSHUFB is used separately with both
23891 // to index the in-register table. Next, both are added and the result is a
23892 // i8 vector where each element contains the pop count for input byte.
23894 // To obtain the pop count for elements != i8, we follow up with the same
23895 // approach and use additional tricks as described below.
// Popcount of each 4-bit value, replicated per 16-byte lane below.
23897 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
23898 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
23899 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
23900 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
23902 int NumByteElts = VecSize / 8;
23903 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
23904 SDValue In = DAG.getBitcast(ByteVecVT, Op);
23905 SmallVector<SDValue, 64> LUTVec;
// i % 16 repeats the 16-entry table in every 128-bit lane, matching
// PSHUFB's per-lane indexing.
23906 for (int i = 0; i < NumByteElts; ++i)
23907 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23908 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
23909 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
// Split each byte into its high nibble (shifted down) and low nibble.
23912 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
23913 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23916 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23918 // The input vector is used as the shuffle mask that index elements into the
23919 // LUT. After counting low and high nibbles, add the vector to obtain the
23920 // final pop count per i8 element.
23921 SDValue HighPopCnt =
23922 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23923 SDValue LowPopCnt =
23924 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23925 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
// i8 elements: the per-byte count is the answer (the extraction dropped
// the bitcast-return body of this branch); wider elements are summed.
23927 if (EltVT == MVT::i8)
23930 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
// Pre-SSSE3 fallback: classic parallel bit-count (Hacker's Delight style)
// on a 128-bit vector, using shifts/masks/adds instead of multiplies.
// NOTE(review): the extraction dropped `SDValue V = Op;`, the lambda
// terminators (`};`) and the final call's closing parens; code left as-is.
23933 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23934 const X86Subtarget &Subtarget,
23935 SelectionDAG &DAG) {
23936 MVT VT = Op.getSimpleValueType();
23937 assert(VT.is128BitVector() &&
23938 "Only 128-bit vector bitmath lowering supported.");
23940 int VecSize = VT.getSizeInBits();
23941 MVT EltVT = VT.getVectorElementType();
23942 int Len = EltVT.getSizeInBits();
23944 // This is the vectorized version of the "best" algorithm from
23945 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23946 // with a minor tweak to use a series of adds + shifts instead of vector
23947 // multiplications. Implemented for all integer vector types. We only use
23948 // this when we don't have SSSE3 which allows a LUT-based lowering that is
23949 // much faster, even faster than using native popcnt instructions.
// Helper: element-wise shift of V by a constant amount.
23951 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23952 MVT VT = V.getSimpleValueType();
23953 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23954 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
// Helper: AND V with a splatted constant mask.
23956 auto GetMask = [&](SDValue V, APInt Mask) {
23957 MVT VT = V.getSimpleValueType();
23958 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23959 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23962 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23963 // x86, so set the SRL type to have elements at least i16 wide. This is
23964 // correct because all of our SRLs are followed immediately by a mask anyways
23965 // that handles any bits that sneak into the high bits of the byte elements.
23966 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23970 // v = v - ((v >> 1) & 0x55555555...)
23972 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23973 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23974 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23976 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23977 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23978 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23979 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23980 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23982 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23983 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23984 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23985 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23987 // At this point, V contains the byte-wise population count, and we are
23988 // merely doing a horizontal sum if necessary to get the wider element
// i8 elements are done here; wider elements sum the per-byte counts.
23990 if (EltVT == MVT::i8)
23993 return LowerHorizontalByteSum(
23994 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23998 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23999 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
// Dispatcher for vector CTPOP lowering: VPOPCNTDQ zext/trunc trick when
// available, bitmath fallback without SSSE3, 256/512-bit splitting when the
// wide integer ops are unavailable, and otherwise the PSHUFB LUT path.
24000 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24001 SelectionDAG &DAG) {
24002 MVT VT = Op.getSimpleValueType();
24003 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
24004 "Unknown CTPOP type to handle");
24005 SDLoc DL(Op.getNode());
24006 SDValue Op0 = Op.getOperand(0);
24008 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
24009 if (Subtarget.hasVPOPCNTDQ()) {
24010 unsigned NumElems = VT.getVectorNumElements();
// Only vXi8/vXi16 reach custom lowering with VPOPCNTDQ; i32/i64 are legal.
24011 assert((VT.getVectorElementType() == MVT::i8 ||
24012 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
24013 if (NumElems <= 16) {
24014 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
24015 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
24016 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
24017 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
24021 if (!Subtarget.hasSSSE3()) {
24022 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
24023 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
24024 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
24027 // Decompose 256-bit ops into smaller 128-bit ops.
24028 if (VT.is256BitVector() && !Subtarget.hasInt256())
24029 return Lower256IntUnary(Op, DAG);
24031 // Decompose 512-bit ops into smaller 256-bit ops.
24032 if (VT.is512BitVector() && !Subtarget.hasBWI())
24033 return Lower512IntUnary(Op, DAG);
24035 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
// Entry point for custom ISD::CTPOP lowering. Only vector types are routed
// here; scalar popcount is presumably handled by ISel/expansion elsewhere —
// the assert below enforces the vector-only contract.
24038 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24039 SelectionDAG &DAG) {
24040 assert(Op.getSimpleValueType().isVector() &&
24041 "We only do custom lowering for vector population count.");
24042 return LowerVectorCTPOP(Op, Subtarget, DAG);
// Lower ISD::BITREVERSE using the XOP VPPERM instruction, which can reverse
// the bits of each source byte while simultaneously shuffling bytes (so the
// per-element byte swap is folded into the same permute).
24045 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
24046 MVT VT = Op.getSimpleValueType();
24047 SDValue In = Op.getOperand(0);
24050 // For scalars, its still beneficial to transfer to/from the SIMD unit to
24051 // perform the BITREVERSE.
24052 if (!VT.isVector()) {
// Widen the scalar to a full 128-bit vector, bit-reverse element 0, then
// extract it back out.
24053 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
24054 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
24055 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
24056 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
24057 DAG.getIntPtrConstant(0, DL));
24060 int NumElts = VT.getVectorNumElements();
24061 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
24063 // Decompose 256-bit ops into smaller 128-bit ops.
24064 if (VT.is256BitVector())
24065 return Lower256IntUnary(Op, DAG);
24067 assert(VT.is128BitVector() &&
24068 "Only 128-bit vector bitreverse lowering supported.");
24070 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
24071 // perform the BSWAP in the shuffle.
24072 // Its best to shuffle using the second operand as this will implicitly allow
24073 // memory folding for multiple vectors.
24074 SmallVector<SDValue, 16> MaskElts;
// Build the 16-byte VPPERM selector: bytes within each element are visited
// in reverse (j counts down) to byte-swap, and each selector byte ORs in the
// (2 << 5) op field to request bit reversal of the selected source byte.
24075 for (int i = 0; i != NumElts; ++i) {
24076 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
// +16 selects from the second VPPERM source operand (see comment above).
24077 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
24078 int PermuteByte = SourceByte | (2 << 5);
24079 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
24083 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
24084 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
24085 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
24087 return DAG.getBitcast(VT, Res);
// Lower ISD::BITREVERSE for byte vectors. Prefers XOP's VPPERM when
// available (sub-512-bit only); otherwise uses two PSHUFB table lookups —
// one per nibble — and ORs the results together.
24090 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
24091 SelectionDAG &DAG) {
24092 MVT VT = Op.getSimpleValueType();
24094 if (Subtarget.hasXOP() && !VT.is512BitVector())
24095 return LowerBITREVERSE_XOP(Op, DAG);
24097 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
24099 SDValue In = Op.getOperand(0);
24102 unsigned NumElts = VT.getVectorNumElements();
24103 assert(VT.getScalarType() == MVT::i8 &&
24104 "Only byte vector BITREVERSE supported");
24106 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
24107 if (VT.is256BitVector() && !Subtarget.hasInt256())
24108 return Lower256IntUnary(Op, DAG);
24110 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
24111 // two nibbles and a PSHUFB lookup to find the bitreverse of each
24112 // 0-15 value (moved to the other nibble).
24113 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
24114 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
24115 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
// LoLUT[n] = bitreverse(n) placed in the HIGH nibble of the result byte;
// HiLUT[n] = bitreverse(n) placed in the LOW nibble. OR-ing the two lookups
// yields the fully bit-reversed byte.
24117 const int LoLUT[16] = {
24118 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
24119 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
24120 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
24121 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
24122 const int HiLUT[16] = {
24123 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
24124 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
24125 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
24126 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
// Replicate the 16-entry tables across all lanes of the vector.
24128 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
24129 for (unsigned i = 0; i < NumElts; ++i) {
24130 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
24131 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
24134 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
24135 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
// PSHUFB operands: the LUT is the shuffled value, the nibbles are the index.
24136 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
24137 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
24138 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
// Map an ISD::ATOMIC_LOAD_* node (whose result value is unused) onto the
// corresponding X86ISD::L* LOCK-prefixed memory op. When the operand is the
// constant +1/-1, LADD/LSUB are further converted to LINC/LDEC (smaller
// encoding) unless inc/dec is slow on this subtarget and we are not
// optimizing for size. AllowIncDec lets callers suppress that conversion.
24141 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
24142 const X86Subtarget &Subtarget,
24143 bool AllowIncDec = true) {
24144 unsigned NewOpc = 0;
24145 switch (N->getOpcode()) {
24146 case ISD::ATOMIC_LOAD_ADD:
24147 NewOpc = X86ISD::LADD;
24149 case ISD::ATOMIC_LOAD_SUB:
24150 NewOpc = X86ISD::LSUB;
24152 case ISD::ATOMIC_LOAD_OR:
24153 NewOpc = X86ISD::LOR;
24155 case ISD::ATOMIC_LOAD_XOR:
24156 NewOpc = X86ISD::LXOR;
24158 case ISD::ATOMIC_LOAD_AND:
24159 NewOpc = X86ISD::LAND;
24162 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
24165 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
24167 if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
24168 // Convert to inc/dec if they aren't slow or we are optimizing for size.
24169 if (AllowIncDec && (!Subtarget.slowIncDec() ||
24170 DAG.getMachineFunction().getFunction().optForSize())) {
// add +1 and sub -1 are both an increment.
24171 if ((NewOpc == X86ISD::LADD && C->isOne()) ||
24172 (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
24173 return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
24174 DAG.getVTList(MVT::i32, MVT::Other),
24175 {N->getOperand(0), N->getOperand(1)},
24176 /*MemVT=*/N->getSimpleValueType(0), MMO);
// sub +1 and add -1 are both a decrement.
24177 if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
24178 (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
24179 return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
24180 DAG.getVTList(MVT::i32, MVT::Other),
24181 {N->getOperand(0), N->getOperand(1)},
24182 /*MemVT=*/N->getSimpleValueType(0), MMO);
// Generic case: LOCK-prefixed RMW with the original value operand.
24186 return DAG.getMemIntrinsicNode(
24187 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
24188 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
24189 /*MemVT=*/N->getSimpleValueType(0), MMO);
24192 /// Lower atomic_load_ops into LOCK-prefixed operations.
///
/// Ops whose result is actually used can only be lowered directly for
/// ATOMIC_LOAD_ADD (via LXADD/XADD); ATOMIC_LOAD_SUB with a used result is
/// rewritten as an add of the negated operand, and all other used-result RMW
/// ops must already have been expanded to cmpxchg loops by AtomicExpand.
/// Unused-result ops are handed to lowerAtomicArithWithLOCK.
24193 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
24194 const X86Subtarget &Subtarget) {
24195 SDValue Chain = N->getOperand(0);
24196 SDValue LHS = N->getOperand(1);
24197 SDValue RHS = N->getOperand(2);
24198 unsigned Opc = N->getOpcode();
24199 MVT VT = N->getSimpleValueType(0);
24202 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
24203 // can only be lowered when the result is unused. They should have already
24204 // been transformed into a cmpxchg loop in AtomicExpand.
24205 if (N->hasAnyUseOfValue(0)) {
24206 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
24207 // select LXADD if LOCK_SUB can't be selected.
24208 if (Opc == ISD::ATOMIC_LOAD_SUB) {
24209 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
// Negate RHS as (0 - RHS) so the op becomes an atomic add.
24210 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
24211 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
24212 RHS, AN->getMemOperand());
24214 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
24215 "Used AtomicRMW ops other than Add should have been expanded!");
// Result is unused: emit the LOCK-prefixed form and rewire only the chain.
24219 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
24220 // RAUW the chain, but don't worry about the result, as it's unused.
24221 assert(!N->hasAnyUseOfValue(0));
24222 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
// Lower ISD::ATOMIC_STORE. Sequentially-consistent stores, and stores of
// types that are not legal, are converted to an ATOMIC_SWAP whose result is
// discarded (only the chain is returned); other atomic stores match a simple
// MOV pattern and are returned unchanged.
24226 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
24227 SDNode *Node = Op.getNode();
24229 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
24231 // Convert seq_cst store -> xchg
24232 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
24233 // FIXME: On 32-bit, store -> fist or movq would be more efficient
24234 // (The only way to get a 16-byte store is cmpxchg16b)
24235 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
24236 if (cast<AtomicSDNode>(Node)->getOrdering() ==
24237 AtomicOrdering::SequentiallyConsistent ||
24238 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
24239 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
24240 cast<AtomicSDNode>(Node)->getMemoryVT(),
24241 Node->getOperand(0),
24242 Node->getOperand(1), Node->getOperand(2),
24243 cast<AtomicSDNode>(Node)->getMemOperand());
// Return only the chain; the swapped-out value is intentionally dropped.
24244 return Swap.getValue(1);
24246 // Other atomic stores have a simple pattern.
// Lower ISD::ADDCARRY / ISD::SUBCARRY to X86ISD::ADC / X86ISD::SBB.
// The incoming carry (an arbitrary integer) must first be materialized into
// EFLAGS; adding all-ones to it sets CF exactly when the carry is nonzero.
24250 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
24251 SDNode *N = Op.getNode();
24252 MVT VT = N->getSimpleValueType(0);
24254 // Let legalize expand this if it isn't a legal type yet.
24255 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
24258 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
24261 // Set the carry flag.
24262 SDValue Carry = Op.getOperand(2);
24263 EVT CarryVT = Carry.getValueType();
24264 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
// Carry + (-1) overflows (sets CF) iff Carry != 0; value result is unused.
24265 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24266 Carry, DAG.getConstant(NegOne, DL, CarryVT));
24268 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
24269 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
24270 Op.getOperand(1), Carry.getValue(1));
// Materialize the carry-out as a setcc on CF for the second result value.
24272 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
24273 if (N->getValueType(1) == MVT::i1)
24274 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
24276 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
// Lower ISD::FSINCOS on 64-bit Darwin by calling the __sincos_stret runtime
// entry point, which returns sin and cos together in XMM registers, then
// unpacking the pair into the node's two results.
24279 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
24280 SelectionDAG &DAG) {
24281 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
24283 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
24284 // which returns the values as { float, float } (in XMM0) or
24285 // { double, double } (which is returned in XMM0, XMM1).
24287 SDValue Arg = Op.getOperand(0);
24288 EVT ArgVT = Arg.getValueType();
24289 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24291 TargetLowering::ArgListTy Args;
24292 TargetLowering::ArgListEntry Entry;
24296 Entry.IsSExt = false;
24297 Entry.IsZExt = false;
24298 Args.push_back(Entry);
24300 bool isF64 = ArgVT == MVT::f64;
24301 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
24302 // the small struct {f32, f32} is returned in (eax, edx). For f64,
24303 // the results are returned via SRet in memory.
24304 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24305 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
24306 const char *LibcallName = TLI.getLibcallName(LC);
24308 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
// Model the f32 return as a v4f32 so both results land in XMM0; the f64
// return is a true {double, double} struct split across XMM0/XMM1.
24310 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
24311 : (Type *)VectorType::get(ArgTy, 4);
24313 TargetLowering::CallLoweringInfo CLI(DAG);
24314 CLI.setDebugLoc(dl)
24315 .setChain(DAG.getEntryNode())
24316 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
24318 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
24321 // Returned in xmm0 and xmm1.
24322 return CallResult.first;
24324 // Returned in bits 0:31 and 32:64 xmm0.
// f32 path: sin is vector element 0, cos is element 1.
24325 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24326 CallResult.first, DAG.getIntPtrConstant(0, dl));
24327 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24328 CallResult.first, DAG.getIntPtrConstant(1, dl));
24329 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
24330 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
24333 /// Widen a vector input to a vector of NVT.  The
24334 /// input vector must have the same element type as NVT.
///
/// \param InOp          The narrow vector (may be undef or a build_vector).
/// \param NVT           Target vector type; element type must match InOp's
///                      and its element count must be a strict multiple of
///                      InOp's.
/// \param FillWithZeroes When true the extra trailing elements are zero,
///                      otherwise they are undef.
24335 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
24336 bool FillWithZeroes = false) {
24337 // Check if InOp already has the right width.
24338 MVT InVT = InOp.getSimpleValueType();
24342 if (InOp.isUndef())
24343 return DAG.getUNDEF(NVT);
24345 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
24346 "input and widen element type must match");
24348 unsigned InNumElts = InVT.getVectorNumElements();
24349 unsigned WidenNumElts = NVT.getVectorNumElements();
24350 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
24351 "Unexpected request for vector widening");
// Peel off a concat with an all-zero/undef upper half that matches the
// requested fill, so we widen the meaningful half directly.
24354 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
24355 InOp.getNumOperands() == 2) {
24356 SDValue N1 = InOp.getOperand(1);
24357 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
24359 InOp = InOp.getOperand(0);
24360 InVT = InOp.getSimpleValueType();
24361 InNumElts = InVT.getVectorNumElements();
// Constant build_vectors are widened element-wise so the result stays a
// single constant build_vector.
24364 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
24365 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
24366 SmallVector<SDValue, 16> Ops;
24367 for (unsigned i = 0; i < InNumElts; ++i)
24368 Ops.push_back(InOp.getOperand(i));
24370 EVT EltVT = InOp.getOperand(0).getValueType();
24372 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
24373 DAG.getUNDEF(EltVT);
24374 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
24375 Ops.push_back(FillVal);
24376 return DAG.getBuildVector(NVT, dl, Ops);
// General case: insert the narrow vector at offset 0 of a zero/undef base.
24378 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
24380 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
24381 InOp, DAG.getIntPtrConstant(0, dl));
// Custom-lower ISD::MSCATTER for AVX-512. Handles two legalization quirks:
// (1) a v2i32 source that type legalization promoted to v2i64 is re-widened
// to v4i32 with matching index/mask, and (2) on AVX512F-only subtargets
// (no VLX) narrow scatters are widened so either data or index is 512-bit.
24384 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
24385 SelectionDAG &DAG) {
24386 assert(Subtarget.hasAVX512() &&
24387 "MGATHER/MSCATTER are supported on AVX-512 arch only");
24389 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
24390 SDValue Src = N->getValue();
24391 MVT VT = Src.getSimpleValueType();
24392 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
24395 SDValue Index = N->getIndex();
24396 SDValue Mask = N->getMask();
24397 SDValue Chain = N->getChain();
24398 SDValue BasePtr = N->getBasePtr();
24399 MVT MemVT = N->getMemoryVT().getSimpleVT();
24400 MVT IndexVT = Index.getSimpleValueType();
24401 MVT MaskVT = Mask.getSimpleValueType();
24403 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
24404 // The v2i32 value was promoted to v2i64.
24405 // Now we "redo" the type legalizer's work and widen the original
24406 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
24408 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
24409 "Unexpected memory type");
// Pick the low 32 bits of each i64 lane (elements 0 and 2 of the v4i32
// view); upper lanes are undef.
24410 int ShuffleMask[] = {0, 2, -1, -1};
24411 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
24412 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
24413 // Now we have 4 elements instead of 2.
24414 // Expand the index.
24415 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
24416 Index = ExtendToType(Index, NewIndexVT, DAG);
24418 // Expand the mask with zeroes
24419 // Mask may be <2 x i64> or <2 x i1> at this moment
24420 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
24421 "Unexpected mask type");
24422 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
24423 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
24427 unsigned NumElts = VT.getVectorNumElements();
24428 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
24429 !Index.getSimpleValueType().is512BitVector()) {
24430 // AVX512F supports only 512-bit vectors. Or data or index should
24431 // be 512 bit wide. If now the both index and data are 256-bit, but
24432 // the vector contains 8 elements, we just sign-extend the index
24433 if (IndexVT == MVT::v8i32)
24434 // Just extend index
24435 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24437 // The minimal number of elts in scatter is 8
24440 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
24441 // Use original index here, do not modify the index twice
24442 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
24443 if (IndexVT.getScalarType() == MVT::i32)
24444 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24447 // At this point we have promoted mask operand
24448 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
24449 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
24450 // Use the original mask here, do not modify the mask twice
24451 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
24453 // The value that should be stored
24454 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
24455 Src = ExtendToType(Src, NewVT, DAG);
24458 // If the mask is "wide" at this point - truncate it to i1 vector
24459 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
24460 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
24462 // The mask is killed by scatter, add it to the values
24463 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
24464 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
24465 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24466 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
// Only the chain result (value 1) is meaningful to callers.
24467 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24468 return SDValue(NewScatter.getNode(), 1);
// Custom-lower a masked vector load (ISD::MLOAD) for AVX-512 subtargets
// without VLX: widen the data, pass-through and mask to 512 bits so the
// operation is legal, emit the wide masked load, then extract the original
// narrow result.
24471 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
24472 SelectionDAG &DAG) {
24474 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
24475 MVT VT = Op.getSimpleValueType();
24476 MVT ScalarVT = VT.getScalarType();
24477 SDValue Mask = N->getMask();
24480 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
24481 "Expanding masked load is supported on AVX-512 target only!");
24483 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
24484 "Expanding masked load is supported for 32 and 64-bit types only!");
24486 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
24487 // VLX. These types for exp-loads are handled here.
24488 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
24491 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24492 "Cannot lower masked load op.");
24494 assert((ScalarVT.getSizeInBits() >= 32 ||
24495 (Subtarget.hasBWI() &&
24496 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24497 "Unsupported masked load op.");
24499 // This operation is legal for targets with VLX, but without
24500 // VLX the vector should be widened to 512 bit
24501 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
24502 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
24503 SDValue Src0 = N->getSrc0();
24504 Src0 = ExtendToType(Src0, WideDataVT, DAG);
24506 // Mask element has to be i1.
24507 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
24508 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
24509 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
24511 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
// Widen the mask with zeros (inactive lanes) and, if it is still an
// integer-element mask, truncate it to the required i1 vector.
24513 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24514 if (MaskEltTy != MVT::i1)
24515 Mask = DAG.getNode(ISD::TRUNCATE, dl,
24516 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
24517 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
24518 N->getBasePtr(), Mask, Src0,
24519 N->getMemoryVT(), N->getMemOperand(),
24520 N->getExtensionType(),
24521 N->isExpandingLoad());
// Extract the original-width subvector and forward the chain.
24523 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
24524 NewLoad.getValue(0),
24525 DAG.getIntPtrConstant(0, dl));
24526 SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
24527 return DAG.getMergeValues(RetOps, dl);
// Custom-lower a masked vector store (ISD::MSTORE) for AVX-512 subtargets
// without VLX: widen the stored data and the mask to 512 bits so the
// operation is legal, then emit the widened masked store. Mirrors LowerMLOAD.
24530 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
24531 SelectionDAG &DAG) {
24532 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
24533 SDValue DataToStore = N->getValue();
24534 MVT VT = DataToStore.getSimpleValueType();
24535 MVT ScalarVT = VT.getScalarType();
24536 SDValue Mask = N->getMask();
// NOTE: these diagnostics previously said "masked load" — copy-pasted from
// LowerMLOAD; this function handles compressing masked *stores*.
24539 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
24540 "Compressing masked store is supported on AVX-512 target only!");
24542 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
24543 "Compressing masked store is supported for 32 and 64-bit types only!");
24545 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
24546 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
24549 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24550 "Cannot lower masked store op.");
24552 assert((ScalarVT.getSizeInBits() >= 32 ||
24553 (Subtarget.hasBWI() &&
24554 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24555 "Unsupported masked store op.");
24557 // This operation is legal for targets with VLX, but without
24558 // VLX the vector should be widened to 512 bit
24559 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
24560 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
24562 // Mask element has to be i1.
24563 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
24564 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
24565 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
24567 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
// Widen data (undef fill) and mask (zero fill = inactive lanes); truncate a
// wide integer mask down to the i1 vector the masked store requires.
24569 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
24570 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24571 if (MaskEltTy != MVT::i1)
24572 Mask = DAG.getNode(ISD::TRUNCATE, dl,
24573 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
24574 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
24575 Mask, N->getMemoryVT(), N->getMemOperand(),
24576 N->isTruncatingStore(), N->isCompressingStore());
// Custom-lower ISD::MGATHER. On AVX512F-only subtargets (no VLX) narrow
// gathers are widened so that either the data or the index is 512 bits; a
// v8i32 index paired with 256-bit data only needs a sign-extension of the
// index. The final node also produces the mask so the register allocator
// knows the gather clobbers it.
24579 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
24580 SelectionDAG &DAG) {
24581 assert(Subtarget.hasAVX2() &&
24582 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
24584 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
24586 MVT VT = Op.getSimpleValueType();
24587 SDValue Index = N->getIndex();
24588 SDValue Mask = N->getMask();
24589 SDValue Src0 = N->getValue();
24590 MVT IndexVT = Index.getSimpleValueType();
24591 MVT MaskVT = Mask.getSimpleValueType();
24593 unsigned NumElts = VT.getVectorNumElements();
24594 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
24596 // If the index is v2i32, we're being called by type legalization.
24597 if (IndexVT == MVT::v2i32)
24600 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24601 !Index.getSimpleValueType().is512BitVector()) {
24602 // AVX512F supports only 512-bit vectors. Or data or index should
24603 // be 512 bit wide. If now the both index and data are 256-bit, but
24604 // the vector contains 8 elements, we just sign-extend the index
24605 if (NumElts == 8) {
24606 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24607 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24608 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24609 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24610 N->getMemOperand());
// Results: value 0 is the gathered data, value 2 is the chain.
24611 return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
24614 // Minimal number of elements in Gather
// Widen index to the minimum gather width; i32 indices are sign-extended.
24617 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
24618 Index = ExtendToType(Index, NewIndexVT, DAG);
24619 if (IndexVT.getScalarType() == MVT::i32)
24620 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24623 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
24624 // At this point we have promoted mask operand
24625 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
24626 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
// Zero-fill the widened mask (inactive lanes), then narrow to i1 elements.
24627 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
24628 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
24630 // The pass-through value
24631 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
24632 Src0 = ExtendToType(Src0, NewVT, DAG);
24634 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24635 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24636 DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24637 N->getMemOperand());
// Extract the original-width result and forward the chain (value 2).
24638 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
24639 NewGather.getValue(0),
24640 DAG.getIntPtrConstant(0, dl));
24641 SDValue RetOps[] = {Extract, NewGather.getValue(2)};
24642 return DAG.getMergeValues(RetOps, dl);
// Already-legal gathers: emit the X86 node directly.
24645 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24646 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24647 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24648 N->getMemOperand());
24649 return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
// Lower GC_TRANSITION_START to a literal NOOP, preserving the chain and any
// incoming glue (presumably so the transition stays ordered with the call it
// brackets — TODO confirm against the GC statepoint lowering).
24652 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
24653 SelectionDAG &DAG) const {
24654 // TODO: Eventually, the lowering of these nodes should be informed by or
24655 // deferred to the GC strategy for the function in which they appear. For
24656 // now, however, they must be lowered to something. Since they are logically
24657 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24658 // require special handling for these nodes), lower them as literal NOOPs for
24660 SmallVector<SDValue, 2> Ops;
24662 Ops.push_back(Op.getOperand(0));
24663 if (Op->getGluedNode())
24664 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24667 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24668 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
// Lower GC_TRANSITION_END to a literal NOOP; identical in structure to
// LowerGC_TRANSITION_START (chain plus optional trailing glue operand).
24673 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
24674 SelectionDAG &DAG) const {
24675 // TODO: Eventually, the lowering of these nodes should be informed by or
24676 // deferred to the GC strategy for the function in which they appear. For
24677 // now, however, they must be lowered to something. Since they are logically
24678 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24679 // require special handling for these nodes), lower them as literal NOOPs for
24681 SmallVector<SDValue, 2> Ops;
24683 Ops.push_back(Op.getOperand(0));
24684 if (Op->getGluedNode())
24685 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24688 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24689 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24694 /// Provide custom lowering hooks for some operations.
///
/// Central dispatch for every opcode marked Custom in the X86 lowering
/// setup: each case forwards to the dedicated Lower* helper. The default
/// case is unreachable — an opcode only lands here if it was registered as
/// Custom, so a miss indicates a setup/dispatch mismatch.
24695 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24696 switch (Op.getOpcode()) {
24697 default: llvm_unreachable("Should not custom lower this!");
24698 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24699 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24700 return LowerCMP_SWAP(Op, Subtarget, DAG);
24701 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
24702 case ISD::ATOMIC_LOAD_ADD:
24703 case ISD::ATOMIC_LOAD_SUB:
24704 case ISD::ATOMIC_LOAD_OR:
24705 case ISD::ATOMIC_LOAD_XOR:
24706 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
24707 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
24708 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
24709 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
24710 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24711 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
24712 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
24713 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24714 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
24715 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24716 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
24717 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24718 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
24719 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
24720 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
24721 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
24722 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
24723 case ISD::SHL_PARTS:
24724 case ISD::SRA_PARTS:
24725 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
24726 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
24727 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
24728 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
24729 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
24730 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24731 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
24732 case ISD::ZERO_EXTEND_VECTOR_INREG:
24733 case ISD::SIGN_EXTEND_VECTOR_INREG:
24734 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24735 case ISD::FP_TO_SINT:
24736 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
24737 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
24738 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
24740 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
24741 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
24742 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
24743 case ISD::SETCC: return LowerSETCC(Op, DAG);
24744 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
24745 case ISD::SELECT: return LowerSELECT(Op, DAG);
24746 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
24747 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
24748 case ISD::VASTART: return LowerVASTART(Op, DAG);
24749 case ISD::VAARG: return LowerVAARG(Op, DAG);
24750 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
24751 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
24752 case ISD::INTRINSIC_VOID:
24753 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24754 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
24755 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
24756 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
24757 case ISD::FRAME_TO_ARGS_OFFSET:
24758 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24759 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24760 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
24761 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
24762 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
24763 case ISD::EH_SJLJ_SETUP_DISPATCH:
24764 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24765 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
24766 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
24767 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
24769 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
24771 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
24772 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
24774 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
24775 case ISD::UMUL_LOHI:
24776 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
24778 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
24781 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
24787 case ISD::UMULO: return LowerXALUO(Op, DAG);
24788 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24789 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
24790 case ISD::ADDCARRY:
24791 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
24793 case ISD::SUB: return LowerADD_SUB(Op, DAG);
24797 case ISD::UMIN: return LowerMINMAX(Op, DAG);
24798 case ISD::ABS: return LowerABS(Op, DAG);
24799 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
24800 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
24801 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
24802 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
24803 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
24804 case ISD::GC_TRANSITION_START:
24805 return LowerGC_TRANSITION_START(Op, DAG);
24806 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
24807 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
24811 /// Places new result values for the node in Results (their number
24812 /// and types must exactly match those of the original return values of
24813 /// the node), or leaves Results empty, which indicates that the node is not
24814 /// to be custom lowered after all.
24815 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24816 SmallVectorImpl<SDValue> &Results,
24817 SelectionDAG &DAG) const {
// Try the single-result custom-lowering entry point first.
24818 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
// No custom lowering produced a node: leave Results empty so the caller
// falls back to its default handling (see doc comment above).
24820 if (!Res.getNode())
24823 assert((N->getNumValues() <= Res->getNumValues()) &&
24824 "Lowering returned the wrong number of results!");
24826 // Places new result values based on N's result numbers.
24827 // In some cases (LowerSINT_TO_FP for example) Res has more result values
24828 // than the original node; the extra value (the chain, last value) is
24829 // deliberately dropped by copying only N->getNumValues() results.
24829 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24830 Results.push_back(Res.getValue(I));
24833 /// Replace a node with an illegal result type with a new node built out of
/// custom code, appending the replacement values to Results. Leaving Results
/// empty indicates the node could not be custom type-legalized here.
24835 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24836 SmallVectorImpl<SDValue>&Results,
24837 SelectionDAG &DAG) const {
24839 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24840 switch (N->getOpcode()) {
24842 llvm_unreachable("Do not know how to custom type legalize this operation!");
24843 case X86ISD::AVG: {
24844 // Legalize types for X86ISD::AVG by expanding vectors.
24845 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24847 auto InVT = N->getValueType(0);
24848 auto InVTSize = InVT.getSizeInBits();
// Round the input width up to the next register size (128/256/512 bits).
24849 const unsigned RegSize =
24850 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24851 assert((Subtarget.hasBWI() || RegSize < 512) &&
24852 "512-bit vector requires AVX512BW");
24853 assert((Subtarget.hasAVX2() || RegSize < 256) &&
24854 "256-bit vector requires AVX2");
24856 auto ElemVT = InVT.getVectorElementType();
24857 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24858 RegSize / ElemVT.getSizeInBits());
24859 assert(RegSize % InVT.getSizeInBits() == 0);
24860 unsigned NumConcat = RegSize / InVT.getSizeInBits();
// Widen both operands to register width by concatenating undef pieces;
// only element 0 of Ops carries the real operand.
24862 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24863 Ops[0] = N->getOperand(0);
24864 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24865 Ops[0] = N->getOperand(1);
24866 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24868 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
// Under widening legalization the wide result is kept as-is; otherwise
// extract the original-width subvector back out.
24869 if (!ExperimentalVectorWideningLegalization)
24870 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24871 DAG.getIntPtrConstant(0, dl));
24872 Results.push_back(Res);
24875 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24876 case X86ISD::FMINC:
24878 case X86ISD::FMAXC:
24879 case X86ISD::FMAX: {
24880 EVT VT = N->getValueType(0);
24881 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24882 SDValue UNDEF = DAG.getUNDEF(VT);
24883 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24884 N->getOperand(0), UNDEF);
24885 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24886 N->getOperand(1), UNDEF);
24887 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
// Wide div/rem is lowered through the Win64 i128 libcall helper.
24895 case ISD::UDIVREM: {
24896 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24897 Results.push_back(V);
24900 case ISD::FP_TO_SINT:
24901 case ISD::FP_TO_UINT: {
24902 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
// v2i32 results need custom widening of the conversion node.
24904 if (N->getValueType(0) == MVT::v2i32) {
24905 assert((IsSigned || Subtarget.hasAVX512()) &&
24906 "Can only handle signed conversion without AVX512");
24907 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24908 SDValue Src = N->getOperand(0);
24909 if (Src.getValueType() == MVT::v2f64) {
24910 MVT ResVT = MVT::v4i32;
24911 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
// Unsigned conversion without AVX512VL: widen the source to 512 bits
// and use the generic FP_TO_UINT, which is legal at that width.
24912 if (!IsSigned && !Subtarget.hasVLX()) {
24913 // Widen to 512-bits.
24914 ResVT = MVT::v8i32;
24915 Opc = ISD::FP_TO_UINT;
24916 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
24917 DAG.getUNDEF(MVT::v8f64),
24918 Src, DAG.getIntPtrConstant(0, dl));
24920 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
24921 ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32
24923 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
24924 DAG.getIntPtrConstant(0, dl));
24925 Results.push_back(Res);
// v2f32 source: pad to v4f32 with undef and convert the whole vector.
24928 if (Src.getValueType() == MVT::v2f32) {
24929 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24930 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24931 DAG.getUNDEF(MVT::v2f32));
24932 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24933 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24934 if (!ExperimentalVectorWideningLegalization)
24935 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24936 Results.push_back(Res);
24940 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24941 // so early out here.
// Scalar path: FP_TO_INTHelper may return the value directly (FIST) or a
// stack slot the result must be loaded from.
24945 std::pair<SDValue,SDValue> Vals =
24946 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24947 SDValue FIST = Vals.first, StackSlot = Vals.second;
24948 if (FIST.getNode()) {
24949 EVT VT = N->getValueType(0);
24950 // Return a load from the stack slot.
24951 if (StackSlot.getNode())
24953 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24955 Results.push_back(FIST);
24959 case ISD::SINT_TO_FP: {
24960 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24961 SDValue Src = N->getOperand(0);
// Only the v2i64 -> v2f32 case is handled here (via CVTSI2P).
24962 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24964 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24967 case ISD::UINT_TO_FP: {
24968 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24969 EVT VT = N->getValueType(0);
24970 if (VT != MVT::v2f32)
24972 SDValue Src = N->getOperand(0);
24973 EVT SrcVT = Src.getValueType();
// With AVX512DQ+VL, v2i64 -> v2f32 maps directly onto CVTUI2P.
24974 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24975 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24978 if (SrcVT != MVT::v2i32)
// v2i32 unsigned -> float via the exponent-bias trick: OR the zero-extended
// value into the mantissa of 2^52 (0x4330000000000000) and subtract the
// bias, then round the f64 result down to f32.
24980 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24982 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24983 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24984 DAG.getBitcast(MVT::v2i64, VBias));
24985 Or = DAG.getBitcast(MVT::v2f64, Or);
24986 // TODO: Are there any fast-math-flags to propagate here?
24987 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24988 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24991 case ISD::FP_ROUND: {
24992 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24994 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24995 Results.push_back(V);
24998 case ISD::FP_EXTEND: {
24999 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
25000 // No other ValueType for FP_EXTEND should reach this point.
25001 assert(N->getValueType(0) == MVT::v2f32 &&
25002 "Do not know how to legalize this Node");
25005 case ISD::INTRINSIC_W_CHAIN: {
25006 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
25008 default : llvm_unreachable("Do not know how to custom type "
25009 "legalize this intrinsic operation!");
25010 case Intrinsic::x86_rdtsc:
25011 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25013 case Intrinsic::x86_rdtscp:
25014 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
25016 case Intrinsic::x86_rdpmc:
25017 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
25019 case Intrinsic::x86_xgetbv:
25020 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
25023 case ISD::INTRINSIC_WO_CHAIN: {
25024 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
25025 Results.push_back(V);
25028 case ISD::READCYCLECOUNTER: {
25029 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
// Expand a wide compare-and-swap into the CMPXCHG8B/CMPXCHG16B pseudos,
// wiring the value halves through the fixed registers those instructions use.
25032 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
25033 EVT T = N->getValueType(0);
25034 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
25035 bool Regs64bit = T == MVT::i128;
25036 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
25037 SDValue cpInL, cpInH;
// Expected value: low half in RAX/EAX, high half in RDX/EDX.
25038 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25039 DAG.getConstant(0, dl, HalfT));
25040 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25041 DAG.getConstant(1, dl, HalfT));
25042 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
25043 Regs64bit ? X86::RAX : X86::EAX,
25045 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
25046 Regs64bit ? X86::RDX : X86::EDX,
25047 cpInH, cpInL.getValue(1));
// New (swap) value: low half in RBX/EBX, high half in RCX/ECX.
25048 SDValue swapInL, swapInH;
25049 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25050 DAG.getConstant(0, dl, HalfT));
25051 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25052 DAG.getConstant(1, dl, HalfT));
25054 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
25055 swapInH, cpInH.getValue(1));
25056 // If the current function needs the base pointer, RBX,
25057 // we shouldn't use cmpxchg directly.
25058 // Indeed the lowering of that instruction will clobber
25059 // that register and since RBX will be a reserved register
25060 // the register allocator will not make sure its value will
25061 // be properly saved and restored around this live-range.
25062 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25064 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
25065 unsigned BasePtr = TRI->getBaseRegister();
25066 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
25067 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
25068 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
25069 // ISel prefers the LCMPXCHG64 variant.
25070 // If that assert breaks, that means it is not the case anymore,
25071 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
25072 // not just EBX. This is a matter of accepting i64 input for that
25073 // pseudo, and restoring into the register of the right wide
25074 // in expand pseudo. Everything else should just work.
25075 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
25076 "Saving only half of the RBX");
25077 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
25078 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
// Snapshot RBX/EBX so the pseudo can restore the base pointer afterwards.
25079 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
25080 Regs64bit ? X86::RBX : X86::EBX,
25081 HalfT, swapInH.getValue(1));
25082 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
25084 /*Glue*/ RBXSave.getValue(2)};
25085 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
// Base pointer not in RBX/EBX: use the plain LCMPXCHG pseudos and feed
// the low half of the swap value through RBX/EBX directly.
25088 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
25089 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
25090 Regs64bit ? X86::RBX : X86::EBX, swapInL,
25091 swapInH.getValue(1));
25092 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
25093 swapInL.getValue(1)};
25094 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
// Read the resulting value back out of RDX:RAX (or EDX:EAX), then read
// EFLAGS and test ZF (COND_E) for the success flag.
25096 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
25097 Regs64bit ? X86::RAX : X86::EAX,
25098 HalfT, Result.getValue(1));
25099 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
25100 Regs64bit ? X86::RDX : X86::EDX,
25101 HalfT, cpOutL.getValue(2));
25102 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
25104 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
25105 MVT::i32, cpOutH.getValue(2));
25106 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
25107 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
// Results: the full-width old value, the success bit, and the chain.
25109 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
25110 Results.push_back(Success);
25111 Results.push_back(EFLAGS.getValue(1));
25114 case ISD::ATOMIC_SWAP:
25115 case ISD::ATOMIC_LOAD_ADD:
25116 case ISD::ATOMIC_LOAD_SUB:
25117 case ISD::ATOMIC_LOAD_AND:
25118 case ISD::ATOMIC_LOAD_OR:
25119 case ISD::ATOMIC_LOAD_XOR:
25120 case ISD::ATOMIC_LOAD_NAND:
25121 case ISD::ATOMIC_LOAD_MIN:
25122 case ISD::ATOMIC_LOAD_MAX:
25123 case ISD::ATOMIC_LOAD_UMIN:
25124 case ISD::ATOMIC_LOAD_UMAX:
25125 case ISD::ATOMIC_LOAD: {
25126 // Delegate to generic TypeLegalization. Situations we can really handle
25127 // should have already been dealt with by AtomicExpandPass.cpp.
25130 case ISD::BITCAST: {
25131 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25132 EVT DstVT = N->getValueType(0);
25133 EVT SrcVT = N->getOperand(0).getValueType();
// Only f64 -> {v2i32, v4i16, v8i8} bitcasts are custom-handled here.
25135 if (SrcVT != MVT::f64 ||
25136 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
// Place the f64 into lane 0 of a v2f64 and bitcast to the double-width
// integer vector type.
25139 unsigned NumElts = DstVT.getVectorNumElements();
25140 EVT SVT = DstVT.getVectorElementType();
25141 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
25142 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
25143 MVT::v2f64, N->getOperand(0));
25144 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
25146 if (ExperimentalVectorWideningLegalization) {
25147 // If we are legalizing vectors by widening, we already have the desired
25148 // legal vector type, just return it.
25149 Results.push_back(ToVecInt);
// Otherwise extract the low NumElts scalars and rebuild the narrow vector.
25153 SmallVector<SDValue, 8> Elts;
25154 for (unsigned i = 0, e = NumElts; i != e; ++i)
25155 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
25156 ToVecInt, DAG.getIntPtrConstant(i, dl)));
25158 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
// Custom-widen v2f32/v2i32 masked gathers rather than letting the generic
// legalizer promote them.
25161 case ISD::MGATHER: {
25162 EVT VT = N->getValueType(0);
25163 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25164 auto *Gather = cast<MaskedGatherSDNode>(N);
25165 SDValue Index = Gather->getIndex();
25166 if (Index.getValueType() != MVT::v2i64)
25168 SDValue Mask = Gather->getMask();
25169 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25170 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25171 Gather->getValue(),
25172 DAG.getUNDEF(MVT::v2f32))
25173 if (!Subtarget.hasVLX()) {
25174 // We need to widen the mask, but the instruction will only use 2
25175 // of its elements. So we can use undef.
25176 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25177 DAG.getUNDEF(MVT::v2i1));
25178 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25180 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25182 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25183 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
25184 Gather->getMemoryVT(), Gather->getMemOperand());
25185 Results.push_back(Res);
25186 Results.push_back(Res.getValue(2));
25189 if (VT == MVT::v2i32) {
25190 auto *Gather = cast<MaskedGatherSDNode>(N);
25191 SDValue Index = Gather->getIndex();
25192 SDValue Mask = Gather->getMask();
25193 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25194 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
25195 Gather->getValue(),
25196 DAG.getUNDEF(MVT::v2i32));
25197 // If the index is v2i64 we can use it directly.
25198 if (Index.getValueType() == MVT::v2i64 &&
25199 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25200 if (!Subtarget.hasVLX()) {
25201 // We need to widen the mask, but the instruction will only use 2
25202 // of its elements. So we can use undef.
25203 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25204 DAG.getUNDEF(MVT::v2i1));
25205 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25207 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25209 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25210 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25211 Gather->getMemoryVT(), Gather->getMemOperand());
25212 SDValue Chain = Res.getValue(2);
25213 if (!ExperimentalVectorWideningLegalization)
25214 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25215 DAG.getIntPtrConstant(0, dl));
25216 Results.push_back(Res);
25217 Results.push_back(Chain);
25220 EVT IndexVT = Index.getValueType();
25221 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25222 IndexVT.getScalarType(), 4);
25223 // Otherwise we need to custom widen everything to avoid promotion.
25224 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25225 DAG.getUNDEF(IndexVT));
// Unlike the target-node paths above, this builds a generic masked gather,
// so the padding mask lanes are zero (inactive), not undef.
25226 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25227 DAG.getConstant(0, dl, MVT::v2i1));
25228 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25230 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25231 Gather->getMemoryVT(), dl, Ops,
25232 Gather->getMemOperand());
25233 SDValue Chain = Res.getValue(1);
25234 if (!ExperimentalVectorWideningLegalization)
25235 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25236 DAG.getIntPtrConstant(0, dl));
25237 Results.push_back(Res);
25238 Results.push_back(Chain);
25246 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25247 switch ((X86ISD::NodeType)Opcode) {
25248 case X86ISD::FIRST_NUMBER: break;
25249 case X86ISD::BSF: return "X86ISD::BSF";
25250 case X86ISD::BSR: return "X86ISD::BSR";
25251 case X86ISD::SHLD: return "X86ISD::SHLD";
25252 case X86ISD::SHRD: return "X86ISD::SHRD";
25253 case X86ISD::FAND: return "X86ISD::FAND";
25254 case X86ISD::FANDN: return "X86ISD::FANDN";
25255 case X86ISD::FOR: return "X86ISD::FOR";
25256 case X86ISD::FXOR: return "X86ISD::FXOR";
25257 case X86ISD::FILD: return "X86ISD::FILD";
25258 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25259 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25260 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25261 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25262 case X86ISD::FLD: return "X86ISD::FLD";
25263 case X86ISD::FST: return "X86ISD::FST";
25264 case X86ISD::CALL: return "X86ISD::CALL";
25265 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25266 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25267 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25268 case X86ISD::BT: return "X86ISD::BT";
25269 case X86ISD::CMP: return "X86ISD::CMP";
25270 case X86ISD::COMI: return "X86ISD::COMI";
25271 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25272 case X86ISD::CMPM: return "X86ISD::CMPM";
25273 case X86ISD::CMPMU: return "X86ISD::CMPMU";
25274 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25275 case X86ISD::SETCC: return "X86ISD::SETCC";
25276 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25277 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25278 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25279 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25280 case X86ISD::CMOV: return "X86ISD::CMOV";
25281 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25282 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25283 case X86ISD::IRET: return "X86ISD::IRET";
25284 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25285 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25286 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25287 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25288 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25289 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25290 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25291 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25292 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25293 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25294 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25295 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25296 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25297 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25298 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25299 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25300 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25301 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25302 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25303 case X86ISD::HADD: return "X86ISD::HADD";
25304 case X86ISD::HSUB: return "X86ISD::HSUB";
25305 case X86ISD::FHADD: return "X86ISD::FHADD";
25306 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25307 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25308 case X86ISD::FMAX: return "X86ISD::FMAX";
25309 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25310 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25311 case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
25312 case X86ISD::FMIN: return "X86ISD::FMIN";
25313 case X86ISD::FMINS: return "X86ISD::FMINS";
25314 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25315 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25316 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25317 case X86ISD::FMINC: return "X86ISD::FMINC";
25318 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25319 case X86ISD::FRCP: return "X86ISD::FRCP";
25320 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25321 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25322 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25323 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25324 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25325 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25326 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25327 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25328 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25329 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25330 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25331 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25332 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25333 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25334 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25335 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25336 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25337 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25338 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25339 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25340 case X86ISD::LADD: return "X86ISD::LADD";
25341 case X86ISD::LSUB: return "X86ISD::LSUB";
25342 case X86ISD::LOR: return "X86ISD::LOR";
25343 case X86ISD::LXOR: return "X86ISD::LXOR";
25344 case X86ISD::LAND: return "X86ISD::LAND";
25345 case X86ISD::LINC: return "X86ISD::LINC";
25346 case X86ISD::LDEC: return "X86ISD::LDEC";
25347 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25348 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25349 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25350 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25351 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25352 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25353 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25354 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25355 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25356 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25357 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25358 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25359 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25360 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25361 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25362 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25363 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25364 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
25365 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25366 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25367 case X86ISD::VSHL: return "X86ISD::VSHL";
25368 case X86ISD::VSRL: return "X86ISD::VSRL";
25369 case X86ISD::VSRA: return "X86ISD::VSRA";
25370 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25371 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25372 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25373 case X86ISD::VSRAV: return "X86ISD::VSRAV";
25374 case X86ISD::VROTLI: return "X86ISD::VROTLI";
25375 case X86ISD::VROTRI: return "X86ISD::VROTRI";
25376 case X86ISD::VPPERM: return "X86ISD::VPPERM";
25377 case X86ISD::CMPP: return "X86ISD::CMPP";
25378 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
25379 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
25380 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
25381 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
25382 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
25383 case X86ISD::ADD: return "X86ISD::ADD";
25384 case X86ISD::SUB: return "X86ISD::SUB";
25385 case X86ISD::ADC: return "X86ISD::ADC";
25386 case X86ISD::SBB: return "X86ISD::SBB";
25387 case X86ISD::SMUL: return "X86ISD::SMUL";
25388 case X86ISD::UMUL: return "X86ISD::UMUL";
25389 case X86ISD::SMUL8: return "X86ISD::SMUL8";
25390 case X86ISD::UMUL8: return "X86ISD::UMUL8";
25391 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
25392 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
25393 case X86ISD::INC: return "X86ISD::INC";
25394 case X86ISD::DEC: return "X86ISD::DEC";
25395 case X86ISD::OR: return "X86ISD::OR";
25396 case X86ISD::XOR: return "X86ISD::XOR";
25397 case X86ISD::AND: return "X86ISD::AND";
25398 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
25399 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
25400 case X86ISD::PTEST: return "X86ISD::PTEST";
25401 case X86ISD::TESTP: return "X86ISD::TESTP";
25402 case X86ISD::TESTM: return "X86ISD::TESTM";
25403 case X86ISD::TESTNM: return "X86ISD::TESTNM";
25404 case X86ISD::KORTEST: return "X86ISD::KORTEST";
25405 case X86ISD::KTEST: return "X86ISD::KTEST";
25406 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
25407 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
25408 case X86ISD::PACKSS: return "X86ISD::PACKSS";
25409 case X86ISD::PACKUS: return "X86ISD::PACKUS";
25410 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
25411 case X86ISD::VALIGN: return "X86ISD::VALIGN";
25412 case X86ISD::VSHLD: return "X86ISD::VSHLD";
25413 case X86ISD::VSHRD: return "X86ISD::VSHRD";
25414 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
25415 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
25416 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
25417 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
25418 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
25419 case X86ISD::SHUFP: return "X86ISD::SHUFP";
25420 case X86ISD::SHUF128: return "X86ISD::SHUF128";
25421 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
25422 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
25423 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
25424 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
25425 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
25426 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
25427 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
25428 case X86ISD::MOVSD: return "X86ISD::MOVSD";
25429 case X86ISD::MOVSS: return "X86ISD::MOVSS";
25430 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
25431 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
25432 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
25433 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
25434 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
25435 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
25436 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
25437 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
25438 case X86ISD::VPERMV: return "X86ISD::VPERMV";
25439 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
25440 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
25441 case X86ISD::VPERMI: return "X86ISD::VPERMI";
25442 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
25443 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
25444 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
25445 case X86ISD::VRANGE: return "X86ISD::VRANGE";
25446 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
25447 case X86ISD::VRANGES: return "X86ISD::VRANGES";
25448 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
25449 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
25450 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
25451 case X86ISD::PSADBW: return "X86ISD::PSADBW";
25452 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
25453 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
25454 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
25455 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
25456 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
25457 case X86ISD::MFENCE: return "X86ISD::MFENCE";
25458 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
25459 case X86ISD::SAHF: return "X86ISD::SAHF";
25460 case X86ISD::RDRAND: return "X86ISD::RDRAND";
25461 case X86ISD::RDSEED: return "X86ISD::RDSEED";
25462 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
25463 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
25464 case X86ISD::VPSHA: return "X86ISD::VPSHA";
25465 case X86ISD::VPSHL: return "X86ISD::VPSHL";
25466 case X86ISD::VPCOM: return "X86ISD::VPCOM";
25467 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
25468 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
25469 case X86ISD::FMSUB: return "X86ISD::FMSUB";
25470 case X86ISD::FNMADD: return "X86ISD::FNMADD";
25471 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
25472 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
25473 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
25474 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
25475 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
25476 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
25477 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
25478 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
25479 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
25480 case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
25481 case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
25482 case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
25483 case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
25484 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
25485 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
25486 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
25487 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
25488 case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
25489 case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
25490 case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
25491 case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
25492 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
25493 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
25494 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
25495 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
25496 case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
25497 case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
25498 case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
25499 case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
25500 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
25501 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
25502 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
25503 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
25504 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
25505 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
25506 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
25507 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
25508 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
25509 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
25510 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
25511 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
25512 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
25513 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
25514 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
25515 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
25516 case X86ISD::XTEST: return "X86ISD::XTEST";
25517 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
25518 case X86ISD::EXPAND: return "X86ISD::EXPAND";
25519 case X86ISD::SELECT: return "X86ISD::SELECT";
25520 case X86ISD::SELECTS: return "X86ISD::SELECTS";
25521 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
25522 case X86ISD::RCP14: return "X86ISD::RCP14";
25523 case X86ISD::RCP14S: return "X86ISD::RCP14S";
25524 case X86ISD::RCP28: return "X86ISD::RCP28";
25525 case X86ISD::RCP28S: return "X86ISD::RCP28S";
25526 case X86ISD::EXP2: return "X86ISD::EXP2";
25527 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
25528 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
25529 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
25530 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
25531 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
25532 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
25533 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
25534 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
25535 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
25536 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
25537 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
25538 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
25539 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
25540 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
25541 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
25542 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
25543 case X86ISD::SCALEF: return "X86ISD::SCALEF";
25544 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
25545 case X86ISD::ADDS: return "X86ISD::ADDS";
25546 case X86ISD::SUBS: return "X86ISD::SUBS";
25547 case X86ISD::AVG: return "X86ISD::AVG";
25548 case X86ISD::MULHRS: return "X86ISD::MULHRS";
25549 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
25550 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
25551 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
25552 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
25553 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
25554 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
25555 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
25556 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
25557 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
25558 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
25559 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
25560 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
25561 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
25562 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
25563 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
25564 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
25565 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
25566 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
25567 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
25568 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
25569 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
25570 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
25571 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
25572 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
25573 case X86ISD::LWPINS: return "X86ISD::LWPINS";
25574 case X86ISD::MGATHER: return "X86ISD::MGATHER";
25575 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
25576 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
25577 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
25578 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
25579 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
25580 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
25581 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
25582 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
25583 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
25588 /// Return true if the addressing mode represented by AM is legal for this
25589 /// target, for a load/store of the specified type.
25590 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
25591 const AddrMode &AM, Type *Ty,
25593 Instruction *I) const {
25594 // X86 supports extremely general addressing modes.
25595 CodeModel::Model M = getTargetMachine().getCodeModel();
25597 // X86 allows a sign-extended 32-bit immediate field as a displacement.
25598 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
25602 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
25604 // If a reference to this global requires an extra load, we can't fold it.
25605 if (isGlobalStubReference(GVFlags))
25608 // If BaseGV requires a register for the PIC base, we cannot also have a
25609 // BaseReg specified.
25610 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
25613 // If lower 4G is not available, then we must use rip-relative addressing.
25614 if ((M != CodeModel::Small || isPositionIndependent()) &&
25615 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
25619 switch (AM.Scale) {
25625 // These scales always work.
25630 // These scales are formed with basereg+scalereg. Only accept if there is
25635 default: // Other stuff never works.
25642 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
25643 unsigned Bits = Ty->getScalarSizeInBits();
25645 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
25646 // particularly cheaper than those without.
25650 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
25651 // shifts just as cheap as scalar ones.
25652 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
25655 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
25656 // fully general vector.
25660 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
25661 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25663 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
25664 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
25665 return NumBits1 > NumBits2;
25668 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
25669 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25672 if (!isTypeLegal(EVT::getEVT(Ty1)))
25675 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
25677 // Assuming the caller doesn't have a zeroext or signext return parameter,
25678 // truncation all the way down to i1 is valid.
25682 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
25683 return isInt<32>(Imm);
25686 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
25687 // Can also use sub to handle negated immediates.
25688 return isInt<32>(Imm);
25691 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
25692 if (!VT1.isInteger() || !VT2.isInteger())
25694 unsigned NumBits1 = VT1.getSizeInBits();
25695 unsigned NumBits2 = VT2.getSizeInBits();
25696 return NumBits1 > NumBits2;
25699 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
25700 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25701 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
25704 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
25705 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25706 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
25709 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
25710 EVT VT1 = Val.getValueType();
25711 if (isZExtFree(VT1, VT2))
25714 if (Val.getOpcode() != ISD::LOAD)
25717 if (!VT1.isSimple() || !VT1.isInteger() ||
25718 !VT2.isSimple() || !VT2.isInteger())
25721 switch (VT1.getSimpleVT().SimpleTy) {
25726 // X86 has 8, 16, and 32-bit zero-extending loads.
25733 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
25736 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
25737 if (!Subtarget.hasAnyFMA())
25740 VT = VT.getScalarType();
25742 if (!VT.isSimple())
25745 switch (VT.getSimpleVT().SimpleTy) {
25756 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
25757 // i16 instructions are longer (0x66 prefix) and potentially slower.
25758 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
25761 /// Targets can use this to indicate that they only support *some*
25762 /// VECTOR_SHUFFLE operations, those with specific masks.
25763 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
25764 /// are assumed to be legal.
25765 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
25766 if (!VT.isSimple())
25769 // Not for i1 vectors
25770 if (VT.getSimpleVT().getScalarType() == MVT::i1)
25773 // Very little shuffling can be done for 64-bit vectors right now.
25774 if (VT.getSimpleVT().getSizeInBits() == 64)
25777 // We only care that the types being shuffled are legal. The lowering can
25778 // handle any possible shuffle mask that results.
25779 return isTypeLegal(VT.getSimpleVT());
25783 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
25785 // Just delegate to the generic legality, clear masks aren't special.
25786 return isShuffleMaskLegal(Mask, VT);
25789 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
25790 // If the subtarget is using retpolines, we need to not generate jump tables.
25791 if (Subtarget.useRetpoline())
25794 // Otherwise, fallback on the generic logic.
25795 return TargetLowering::areJTsAllowed(Fn);
//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
25802 /// Utility function to emit xbegin specifying the start of an RTM region.
25803 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
25804 const TargetInstrInfo *TII) {
25805 DebugLoc DL = MI.getDebugLoc();
25807 const BasicBlock *BB = MBB->getBasicBlock();
25808 MachineFunction::iterator I = ++MBB->getIterator();
25810 // For the v = xbegin(), we generate
25819 // eax = # XABORT_DEF
25823 // v = phi(s0/mainBB, s1/fallBB)
25825 MachineBasicBlock *thisMBB = MBB;
25826 MachineFunction *MF = MBB->getParent();
25827 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25828 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25829 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25830 MF->insert(I, mainMBB);
25831 MF->insert(I, fallMBB);
25832 MF->insert(I, sinkMBB);
25834 // Transfer the remainder of BB and its successor edges to sinkMBB.
25835 sinkMBB->splice(sinkMBB->begin(), MBB,
25836 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25837 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25839 MachineRegisterInfo &MRI = MF->getRegInfo();
25840 unsigned DstReg = MI.getOperand(0).getReg();
25841 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25842 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25843 unsigned fallDstReg = MRI.createVirtualRegister(RC);
25847 // # fallthrough to mainMBB
25848 // # abortion to fallMBB
25849 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25850 thisMBB->addSuccessor(mainMBB);
25851 thisMBB->addSuccessor(fallMBB);
25854 // mainDstReg := -1
25855 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25856 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25857 mainMBB->addSuccessor(sinkMBB);
25860 // ; pseudo instruction to model hardware's definition from XABORT
25861 // EAX := XABORT_DEF
25862 // fallDstReg := EAX
25863 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25864 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25866 fallMBB->addSuccessor(sinkMBB);
25869 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
25870 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25871 .addReg(mainDstReg).addMBB(mainMBB)
25872 .addReg(fallDstReg).addMBB(fallMBB);
25874 MI.eraseFromParent();
25878 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
25879 // or XMM0_V32I8 in AVX all of this code can be replaced with that
25880 // in the .td file.
25881 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25882 const TargetInstrInfo *TII) {
25884 switch (MI.getOpcode()) {
25885 default: llvm_unreachable("illegal opcode!");
25886 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
25887 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25888 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
25889 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25890 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
25891 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25892 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
25893 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25896 DebugLoc dl = MI.getDebugLoc();
25897 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25899 unsigned NumArgs = MI.getNumOperands();
25900 for (unsigned i = 1; i < NumArgs; ++i) {
25901 MachineOperand &Op = MI.getOperand(i);
25902 if (!(Op.isReg() && Op.isImplicit()))
25905 if (MI.hasOneMemOperand())
25906 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25908 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25909 .addReg(X86::XMM0);
25911 MI.eraseFromParent();
25915 // FIXME: Custom handling because TableGen doesn't support multiple implicit
25916 // defs in an instruction pattern
25917 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25918 const TargetInstrInfo *TII) {
25920 switch (MI.getOpcode()) {
25921 default: llvm_unreachable("illegal opcode!");
25922 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
25923 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25924 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
25925 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25926 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
25927 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25928 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
25929 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25932 DebugLoc dl = MI.getDebugLoc();
25933 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25935 unsigned NumArgs = MI.getNumOperands(); // remove the results
25936 for (unsigned i = 1; i < NumArgs; ++i) {
25937 MachineOperand &Op = MI.getOperand(i);
25938 if (!(Op.isReg() && Op.isImplicit()))
25941 if (MI.hasOneMemOperand())
25942 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25944 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25947 MI.eraseFromParent();
25951 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25952 const X86Subtarget &Subtarget) {
25953 DebugLoc dl = MI.getDebugLoc();
25954 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25956 // insert input VAL into EAX
25957 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25958 .addReg(MI.getOperand(0).getReg());
25959 // insert zero to ECX
25960 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25962 // insert zero to EDX
25963 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25965 // insert WRPKRU instruction
25966 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25968 MI.eraseFromParent(); // The pseudo is gone now.
25972 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25973 const X86Subtarget &Subtarget) {
25974 DebugLoc dl = MI.getDebugLoc();
25975 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25977 // insert zero to ECX
25978 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25980 // insert RDPKRU instruction
25981 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
25982 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25985 MI.eraseFromParent(); // The pseudo is gone now.
25989 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
25990 const X86Subtarget &Subtarget,
25992 DebugLoc dl = MI.getDebugLoc();
25993 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25994 // Address into RAX/EAX, other two args into ECX, EDX.
25995 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25996 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25997 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25998 for (int i = 0; i < X86::AddrNumOperands; ++i)
25999 MIB.add(MI.getOperand(i));
26001 unsigned ValOps = X86::AddrNumOperands;
26002 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
26003 .addReg(MI.getOperand(ValOps).getReg());
26004 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
26005 .addReg(MI.getOperand(ValOps + 1).getReg());
26007 // The instruction doesn't actually take any operands though.
26008 BuildMI(*BB, MI, dl, TII->get(Opc));
26010 MI.eraseFromParent(); // The pseudo is gone now.
26014 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
26015 const X86Subtarget &Subtarget) {
26016 DebugLoc dl = MI->getDebugLoc();
26017 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26018 // Address into RAX/EAX
26019 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26020 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26021 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26022 for (int i = 0; i < X86::AddrNumOperands; ++i)
26023 MIB.add(MI->getOperand(i));
26025 // The instruction doesn't actually take any operands though.
26026 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
26028 MI->eraseFromParent(); // The pseudo is gone now.
26034 MachineBasicBlock *
26035 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
26036 MachineBasicBlock *MBB) const {
26037 // Emit va_arg instruction on X86-64.
26039 // Operands to this pseudo-instruction:
26040 // 0 ) Output : destination address (reg)
26041 // 1-5) Input : va_list address (addr, i64mem)
26042 // 6 ) ArgSize : Size (in bytes) of vararg type
26043 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
26044 // 8 ) Align : Alignment of type
26045 // 9 ) EFLAGS (implicit-def)
26047 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
26048 static_assert(X86::AddrNumOperands == 5,
26049 "VAARG_64 assumes 5 address operands");
26051 unsigned DestReg = MI.getOperand(0).getReg();
26052 MachineOperand &Base = MI.getOperand(1);
26053 MachineOperand &Scale = MI.getOperand(2);
26054 MachineOperand &Index = MI.getOperand(3);
26055 MachineOperand &Disp = MI.getOperand(4);
26056 MachineOperand &Segment = MI.getOperand(5);
26057 unsigned ArgSize = MI.getOperand(6).getImm();
26058 unsigned ArgMode = MI.getOperand(7).getImm();
26059 unsigned Align = MI.getOperand(8).getImm();
26061 // Memory Reference
26062 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
26063 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26064 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26066 // Machine Information
26067 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26068 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
26069 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
26070 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
26071 DebugLoc DL = MI.getDebugLoc();
26073 // struct va_list {
26076 // i64 overflow_area (address)
26077 // i64 reg_save_area (address)
26079 // sizeof(va_list) = 24
26080 // alignment(va_list) = 8
26082 unsigned TotalNumIntRegs = 6;
26083 unsigned TotalNumXMMRegs = 8;
26084 bool UseGPOffset = (ArgMode == 1);
26085 bool UseFPOffset = (ArgMode == 2);
26086 unsigned MaxOffset = TotalNumIntRegs * 8 +
26087 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
26089 /* Align ArgSize to a multiple of 8 */
26090 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
26091 bool NeedsAlign = (Align > 8);
26093 MachineBasicBlock *thisMBB = MBB;
26094 MachineBasicBlock *overflowMBB;
26095 MachineBasicBlock *offsetMBB;
26096 MachineBasicBlock *endMBB;
26098 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
26099 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
26100 unsigned OffsetReg = 0;
26102 if (!UseGPOffset && !UseFPOffset) {
26103 // If we only pull from the overflow region, we don't create a branch.
26104 // We don't need to alter control flow.
26105 OffsetDestReg = 0; // unused
26106 OverflowDestReg = DestReg;
26108 offsetMBB = nullptr;
26109 overflowMBB = thisMBB;
26112 // First emit code to check if gp_offset (or fp_offset) is below the bound.
26113 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
26114 // If not, pull from overflow_area. (branch to overflowMBB)
26119 // offsetMBB overflowMBB
26124 // Registers for the PHI in endMBB
26125 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
26126 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
26128 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26129 MachineFunction *MF = MBB->getParent();
26130 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26131 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26132 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26134 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26136 // Insert the new basic blocks
26137 MF->insert(MBBIter, offsetMBB);
26138 MF->insert(MBBIter, overflowMBB);
26139 MF->insert(MBBIter, endMBB);
26141 // Transfer the remainder of MBB and its successor edges to endMBB.
26142 endMBB->splice(endMBB->begin(), thisMBB,
26143 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
26144 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
26146 // Make offsetMBB and overflowMBB successors of thisMBB
26147 thisMBB->addSuccessor(offsetMBB);
26148 thisMBB->addSuccessor(overflowMBB);
26150 // endMBB is a successor of both offsetMBB and overflowMBB
26151 offsetMBB->addSuccessor(endMBB);
26152 overflowMBB->addSuccessor(endMBB);
26154 // Load the offset value into a register
26155 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26156 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
26160 .addDisp(Disp, UseFPOffset ? 4 : 0)
26162 .setMemRefs(MMOBegin, MMOEnd);
26164 // Check if there is enough room left to pull this argument.
26165 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
26167 .addImm(MaxOffset + 8 - ArgSizeA8);
26169 // Branch to "overflowMBB" if offset >= max
26170 // Fall through to "offsetMBB" otherwise
26171 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
26172 .addMBB(overflowMBB);
26175 // In offsetMBB, emit code to use the reg_save_area.
26177 assert(OffsetReg != 0);
26179 // Read the reg_save_area address.
26180 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
26181 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
26187 .setMemRefs(MMOBegin, MMOEnd);
26189 // Zero-extend the offset
26190 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
26191 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
26194 .addImm(X86::sub_32bit);
26196 // Add the offset to the reg_save_area to get the final address.
26197 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
26198 .addReg(OffsetReg64)
26199 .addReg(RegSaveReg);
26201 // Compute the offset for the next argument
26202 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26203 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
26205 .addImm(UseFPOffset ? 16 : 8);
26207 // Store it back into the va_list.
26208 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
26212 .addDisp(Disp, UseFPOffset ? 4 : 0)
26214 .addReg(NextOffsetReg)
26215 .setMemRefs(MMOBegin, MMOEnd);
26218 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
26223 // Emit code to use overflow area
26226 // Load the overflow_area address into a register.
26227 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
26228 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
26234 .setMemRefs(MMOBegin, MMOEnd);
26236 // If we need to align it, do so. Otherwise, just copy the address
26237 // to OverflowDestReg.
26239 // Align the overflow address
26240 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
26241 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
26243 // aligned_addr = (addr + (align-1)) & ~(align-1)
26244 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
26245 .addReg(OverflowAddrReg)
26248 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
26250 .addImm(~(uint64_t)(Align-1));
26252 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
26253 .addReg(OverflowAddrReg);
26256 // Compute the next overflow address after this argument.
26257 // (the overflow address should be kept 8-byte aligned)
26258 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
26259 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
26260 .addReg(OverflowDestReg)
26261 .addImm(ArgSizeA8);
26263 // Store the new overflow address.
26264 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
26270 .addReg(NextAddrReg)
26271 .setMemRefs(MMOBegin, MMOEnd);
26273 // If we branched, emit the PHI to the front of endMBB.
26275 BuildMI(*endMBB, endMBB->begin(), DL,
26276 TII->get(X86::PHI), DestReg)
26277 .addReg(OffsetDestReg).addMBB(offsetMBB)
26278 .addReg(OverflowDestReg).addMBB(overflowMBB);
26281 // Erase the pseudo instruction
26282 MI.eraseFromParent();
26287 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26288 MachineInstr &MI, MachineBasicBlock *MBB) const {
26289 // Emit code to save XMM registers to the stack. The ABI says that the
26290 // number of registers to save is given in %al, so it's theoretically
26291 // possible to do an indirect jump trick to avoid saving all of them,
26292 // however this code takes a simpler approach and just executes all
26293 // of the stores if %al is non-zero. It's less code, and it's probably
26294 // easier on the hardware branch predictor, and stores aren't all that
26295 // expensive anyway.
26297 // Create the new basic blocks. One block contains all the XMM stores,
26298 // and one block is the final destination regardless of whether any
26299 // stores were performed.
26300 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26301 MachineFunction *F = MBB->getParent();
26302 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26303 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26304 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26305 F->insert(MBBIter, XMMSaveMBB);
26306 F->insert(MBBIter, EndMBB);
26308 // Transfer the remainder of MBB and its successor edges to EndMBB.
26309 EndMBB->splice(EndMBB->begin(), MBB,
26310 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26311 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26313 // The original block will now fall through to the XMM save block.
26314 MBB->addSuccessor(XMMSaveMBB);
26315 // The XMMSaveMBB will fall through to the end block.
26316 XMMSaveMBB->addSuccessor(EndMBB);
26318 // Now add the instructions.
26319 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26320 DebugLoc DL = MI.getDebugLoc();
26322 unsigned CountReg = MI.getOperand(0).getReg();
26323 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26324 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26326 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
26327 // If %al is 0, branch around the XMM save block.
26328 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26329 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26330 MBB->addSuccessor(EndMBB);
26333 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26334 // that was just emitted, but clearly shouldn't be "saved".
26335 assert((MI.getNumOperands() <= 3 ||
26336 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26337 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26338 "Expected last argument to be EFLAGS");
26339 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26340 // In the XMM save block, save all the XMM argument registers.
26341 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26342 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26343 MachineMemOperand *MMO = F->getMachineMemOperand(
26344 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26345 MachineMemOperand::MOStore,
26346 /*Size=*/16, /*Align=*/16);
26347 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26348 .addFrameIndex(RegSaveFrameIndex)
26349 .addImm(/*Scale=*/1)
26350 .addReg(/*IndexReg=*/0)
26351 .addImm(/*Disp=*/Offset)
26352 .addReg(/*Segment=*/0)
26353 .addReg(MI.getOperand(i).getReg())
26354 .addMemOperand(MMO);
26357 MI.eraseFromParent(); // The pseudo instruction is gone now.
26362 // The EFLAGS operand of SelectItr might be missing a kill marker
26363 // because there were multiple uses of EFLAGS, and ISel didn't know
26364 // which to mark. Figure out whether SelectItr should have had a
26365 // kill marker, and set it if it should. Returns the correct kill
26367 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26368 MachineBasicBlock* BB,
26369 const TargetRegisterInfo* TRI) {
26370 // Scan forward through BB for a use/def of EFLAGS.
26371 MachineBasicBlock::iterator miI(std::next(SelectItr));
26372 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26373 const MachineInstr& mi = *miI;
26374 if (mi.readsRegister(X86::EFLAGS))
26376 if (mi.definesRegister(X86::EFLAGS))
26377 break; // Should have kill-flag - update below.
26380 // If we hit the end of the block, check whether EFLAGS is live into a
26382 if (miI == BB->end()) {
26383 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26384 sEnd = BB->succ_end();
26385 sItr != sEnd; ++sItr) {
26386 MachineBasicBlock* succ = *sItr;
26387 if (succ->isLiveIn(X86::EFLAGS))
26392 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26393 // out. SelectMI should have a kill flag on EFLAGS.
26394 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
26398 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26399 // together with other CMOV pseudo-opcodes into a single basic-block with
26400 // conditional jump around it.
26401 static bool isCMOVPseudo(MachineInstr &MI) {
26402 switch (MI.getOpcode()) {
26403 case X86::CMOV_FR32:
26404 case X86::CMOV_FR64:
26405 case X86::CMOV_GR8:
26406 case X86::CMOV_GR16:
26407 case X86::CMOV_GR32:
26408 case X86::CMOV_RFP32:
26409 case X86::CMOV_RFP64:
26410 case X86::CMOV_RFP80:
26411 case X86::CMOV_V2F64:
26412 case X86::CMOV_V2I64:
26413 case X86::CMOV_V4F32:
26414 case X86::CMOV_V4F64:
26415 case X86::CMOV_V4I64:
26416 case X86::CMOV_V16F32:
26417 case X86::CMOV_V8F32:
26418 case X86::CMOV_V8F64:
26419 case X86::CMOV_V8I64:
26420 case X86::CMOV_V8I1:
26421 case X86::CMOV_V16I1:
26422 case X86::CMOV_V32I1:
26423 case X86::CMOV_V64I1:
26431 // Helper function, which inserts PHI functions into SinkMBB:
26432 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
26433 // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
26434 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
26435 // the last PHI function inserted.
// Every CMOV in the range is expected to use either the first CMOV's
// condition code CC or its opposite OppCC; opposite-condition CMOVs simply
// have their true/false operands swapped so a single branch covers them all.
26436 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
26437 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
26438 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
26439 MachineBasicBlock *SinkMBB) {
26440 MachineFunction *MF = TrueMBB->getParent();
26441 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
26442 DebugLoc DL = MIItBegin->getDebugLoc();
// The shared condition is read from the first CMOV's operand 3.
26444 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
26445 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
// All PHIs must sit at the very top of SinkMBB, before any instructions
// spliced into it by the caller.
26447 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
26449 // As we are creating the PHIs, we have to be careful if there is more than
26450 // one. Later CMOVs may reference the results of earlier CMOVs, but later
26451 // PHIs have to reference the individual true/false inputs from earlier PHIs.
26452 // That also means that PHI construction must work forward from earlier to
26453 // later, and that the code must maintain a mapping from earlier PHI's
26454 // destination registers, and the registers that went into the PHI.
// Key: an earlier PHI's dest reg; value: its (true-path, false-path) inputs.
26455 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
26456 MachineInstrBuilder MIB;
26458 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
26459 unsigned DestReg = MIIt->getOperand(0).getReg();
26460 unsigned Op1Reg = MIIt->getOperand(1).getReg();
26461 unsigned Op2Reg = MIIt->getOperand(2).getReg();
26463 // If this CMOV we are generating is the opposite condition from
26464 // the jump we generated, then we have to swap the operands for the
26465 // PHI that is going to be generated.
26466 if (MIIt->getOperand(3).getImm() == OppCC)
26467 std::swap(Op1Reg, Op2Reg);
// If an operand was itself produced by an earlier PHI in this run, refer to
// the value flowing along the corresponding edge instead of the PHI result,
// since a PHI operand may not name a PHI defined earlier in the same block.
26469 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
26470 Op1Reg = RegRewriteTable[Op1Reg].first;
26472 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
26473 Op2Reg = RegRewriteTable[Op2Reg].second;
26475 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
26481 // Add this PHI to the rewrite table.
26482 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
26488 // Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
// \param FirstCMOV           Inner CMOV pseudo producing the intermediate value.
// \param SecondCascadedCMOV  Outer CMOV pseudo consuming FirstCMOV's result.
// \param ThisMBB             Block currently holding both pseudos; split here.
// \returns the sink block that receives the remainder of ThisMBB.
26489 MachineBasicBlock *
26490 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
26491 MachineInstr &SecondCascadedCMOV,
26492 MachineBasicBlock *ThisMBB) const {
26493 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26494 DebugLoc DL = FirstCMOV.getDebugLoc();
26496 // We lower cascaded CMOVs such as
26498 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
26500 // to two successive branches.
26502 // Without this, we would add a PHI between the two jumps, which ends up
26503 // creating a few copies all around. For instance, for
26505 // (sitofp (zext (fcmp une)))
26507 // we would generate:
26509 // ucomiss %xmm1, %xmm0
26510 // movss <1.0f>, %xmm0
26511 // movaps %xmm0, %xmm1
26513 // xorps %xmm1, %xmm1
26516 // movaps %xmm1, %xmm0
26520 // because this custom-inserter would have generated:
26532 // A: X = ...; Y = ...
26534 // C: Z = PHI [X, A], [Y, B]
26536 // E: PHI [X, C], [Z, D]
26538 // If we lower both CMOVs in a single step, we can instead generate:
26550 // A: X = ...; Y = ...
26552 // E: PHI [X, A], [X, C], [Y, D]
26554 // Which, in our sitofp/fcmp example, gives us something like:
26556 // ucomiss %xmm1, %xmm0
26557 // movss <1.0f>, %xmm0
26560 // xorps %xmm0, %xmm0
26565 // We lower cascaded CMOV into two successive branches to the same block.
26566 // EFLAGS is used by both, so mark it as live in the second.
26567 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26568 MachineFunction *F = ThisMBB->getParent();
// Three new blocks: two fallthrough (false-path) blocks, one for each
// branch, and the common sink that holds the PHI.
26569 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26570 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26571 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
// Insert the new blocks in function layout order right after ThisMBB.
26573 MachineFunction::iterator It = ++ThisMBB->getIterator();
26574 F->insert(It, FirstInsertedMBB);
26575 F->insert(It, SecondInsertedMBB);
26576 F->insert(It, SinkMBB);
26578 // For a cascaded CMOV, we lower it to two successive branches to
26579 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
26580 // the FirstInsertedMBB.
26581 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
26583 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26584 // live into the sink and copy blocks.
26585 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26586 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
26587 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
26588 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
26589 SinkMBB->addLiveIn(X86::EFLAGS);
26592 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26593 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26594 std::next(MachineBasicBlock::iterator(FirstCMOV)),
26596 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26598 // Fallthrough block for ThisMBB.
26599 ThisMBB->addSuccessor(FirstInsertedMBB);
26600 // The true block target of the first branch is always SinkMBB.
26601 ThisMBB->addSuccessor(SinkMBB);
26602 // Fallthrough block for FirstInsertedMBB.
26603 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
26604 // The true block for the branch of FirstInsertedMBB.
26605 FirstInsertedMBB->addSuccessor(SinkMBB);
26606 // This is fallthrough.
26607 SecondInsertedMBB->addSuccessor(SinkMBB);
26609 // Create the conditional branch instructions.
26610 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
26611 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
26612 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26614 X86::CondCode SecondCC =
26615 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
26616 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
26617 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
26620 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
26621 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
26622 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
26623 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
26624 MachineInstrBuilder MIB =
26625 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
26627 .addMBB(SecondInsertedMBB)
26631 // The second SecondInsertedMBB provides the same incoming value as the
26632 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
26633 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
26634 // Copy the PHI result to the register defined by the second CMOV.
26635 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
26636 TII->get(TargetOpcode::COPY),
26637 SecondCascadedCMOV.getOperand(0).getReg())
26638 .addReg(FirstCMOV.getOperand(0).getReg());
26640 // Now remove the CMOVs.
26641 FirstCMOV.eraseFromParent();
26642 SecondCascadedCMOV.eraseFromParent();
// Custom-insert a pseudo CMOV (or a run of them) as a diamond of basic
// blocks with a conditional branch and PHIs in the join (sink) block.
// \param MI       The first (possibly only) CMOV pseudo to lower.
// \param ThisMBB  Block containing MI; it is split at the CMOV run.
// \returns the sink block that receives the remainder of ThisMBB.
26647 MachineBasicBlock *
26648 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
26649 MachineBasicBlock *ThisMBB) const {
26650 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26651 DebugLoc DL = MI.getDebugLoc();
26653 // To "insert" a SELECT_CC instruction, we actually have to insert the
26654 // diamond control-flow pattern. The incoming instruction knows the
26655 // destination vreg to set, the condition code register to branch on, the
26656 // true/false values to select between and a branch opcode to use.
26661 // cmpTY ccX, r1, r2
26663 // fallthrough --> FalseMBB
26665 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
26666 // as described above, by inserting a BB, and then making a PHI at the join
26667 // point to select the true and false operands of the CMOV in the PHI.
26669 // The code also handles two different cases of multiple CMOV opcodes
26673 // In this case, there are multiple CMOVs in a row, all which are based on
26674 // the same condition setting (or the exact opposite condition setting).
26675 // In this case we can lower all the CMOVs using a single inserted BB, and
26676 // then make a number of PHIs at the join point to model the CMOVs. The only
26677 // trickiness here, is that in a case like:
26679 // t2 = CMOV cond1 t1, f1
26680 // t3 = CMOV cond1 t2, f2
26682 // when rewriting this into PHIs, we have to perform some renaming on the
26683 // temps since you cannot have a PHI operand refer to a PHI result earlier
26684 // in the same block. The "simple" but wrong lowering would be:
26686 // t2 = PHI t1(BB1), f1(BB2)
26687 // t3 = PHI t2(BB1), f2(BB2)
26689 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
26690 // renaming is to note that on the path through BB1, t2 is really just a
26691 // copy of t1, and do that renaming, properly generating:
26693 // t2 = PHI t1(BB1), f1(BB2)
26694 // t3 = PHI t1(BB1), f2(BB2)
26697 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
26698 // function - EmitLoweredCascadedSelect.
26700 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
26701 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26702 MachineInstr *LastCMOV = &MI;
26703 MachineBasicBlock::iterator NextMIIt =
26704 std::next(MachineBasicBlock::iterator(MI));
26706 // Check for case 1, where there are multiple CMOVs with the same condition
26707 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
26708 // number of jumps the most.
26710 if (isCMOVPseudo(MI)) {
26711 // See if we have a string of CMOVS with the same condition.
// Extend [MI, LastCMOV] over every following CMOV pseudo whose condition
// is CC or OppCC; they will all share one diamond.
26712 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
26713 (NextMIIt->getOperand(3).getImm() == CC ||
26714 NextMIIt->getOperand(3).getImm() == OppCC)) {
26715 LastCMOV = &*NextMIIt;
26720 // This checks for case 2, but only do this if we didn't already find
26721 // case 1, as indicated by LastCMOV == MI.
// Case 2 pattern: the next CMOV has the same opcode and false value, and
// consumes (and kills) this CMOV's result as its true value.
26722 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
26723 NextMIIt->getOpcode() == MI.getOpcode() &&
26724 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
26725 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
26726 NextMIIt->getOperand(1).isKill()) {
26727 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
26730 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26731 MachineFunction *F = ThisMBB->getParent();
26732 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
26733 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
// Insert the new blocks in layout order right after ThisMBB.
26735 MachineFunction::iterator It = ++ThisMBB->getIterator();
26736 F->insert(It, FalseMBB);
26737 F->insert(It, SinkMBB);
26739 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26740 // live into the sink and copy blocks.
26741 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26742 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
26743 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
26744 FalseMBB->addLiveIn(X86::EFLAGS);
26745 SinkMBB->addLiveIn(X86::EFLAGS);
26748 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26749 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26750 std::next(MachineBasicBlock::iterator(LastCMOV)),
26752 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26754 // Fallthrough block for ThisMBB.
26755 ThisMBB->addSuccessor(FalseMBB);
26756 // The true block target of the first (or only) branch is always a SinkMBB.
26757 ThisMBB->addSuccessor(SinkMBB);
26758 // Fallthrough block for FalseMBB.
26759 FalseMBB->addSuccessor(SinkMBB);
26761 // Create the conditional branch instruction.
26762 unsigned Opc = X86::GetCondBranchFromCond(CC);
26763 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26766 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
// Build one PHI per CMOV in [MI, LastCMOV], with the renaming described
// above handled by createPHIsForCMOVsInSinkBB.
26768 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
26769 MachineBasicBlock::iterator MIItEnd =
26770 std::next(MachineBasicBlock::iterator(LastCMOV));
26771 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
26773 // Now remove the CMOV(s).
26774 ThisMBB->erase(MIItBegin, MIItEnd);
26779 MachineBasicBlock *
26780 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
26781 MachineBasicBlock *BB) const {
26782 // Combine the following atomic floating-point modification pattern:
26783 // a.store(reg OP a.load(acquire), release)
26784 // Transform them into:
26785 // OPss (%gpr), %xmm
26786 // movss %xmm, (%gpr)
26787 // Or sd equivalent for 64-bit operations.
// Pick FOp (the SSE arithmetic opcode with a folded memory operand) and
// MOp (the scalar store opcode) based on the pseudo being expanded.
26789 switch (MI.getOpcode()) {
26790 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
26791 case X86::RELEASE_FADD32mr:
26792 FOp = X86::ADDSSrm;
26793 MOp = X86::MOVSSmr;
26795 case X86::RELEASE_FADD64mr:
26796 FOp = X86::ADDSDrm;
26797 MOp = X86::MOVSDmr;
26800 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26801 DebugLoc DL = MI.getDebugLoc();
26802 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
// The value operand follows the X86 memory-operand quintet in the pseudo.
26803 unsigned ValOpIdx = X86::AddrNumOperands;
26804 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
// Emit the arithmetic op into a fresh vreg of the same class as the source.
26805 MachineInstrBuilder MIB =
26806 BuildMI(*BB, MI, DL, TII->get(FOp),
26807 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
26809 for (int i = 0; i < X86::AddrNumOperands; ++i) {
26810 MachineOperand &Operand = MI.getOperand(i);
26811 // Clear any kill flags on register operands as we'll create a second
26812 // instruction using the same address operands.
26813 if (Operand.isReg())
26814 Operand.setIsKill(false);
// Emit the store, re-using the pseudo's address operands, and kill the
// temporary result register.
26817 MachineInstr *FOpMI = MIB;
26818 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
26819 for (int i = 0; i < X86::AddrNumOperands; ++i)
26820 MIB.add(MI.getOperand(i));
26821 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
26822 MI.eraseFromParent(); // The pseudo instruction is gone now.
// Expand the segmented-stack alloca pseudo: compare the requested size
// against the stack limit stored in TLS, then either bump the stack pointer
// in-place (bumpMBB) or call into libgcc's __morestack allocator (mallocMBB),
// joining the two resulting pointers with a PHI in continueMBB.
26826 MachineBasicBlock *
26827 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
26828 MachineBasicBlock *BB) const {
26829 MachineFunction *MF = BB->getParent();
26830 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26831 DebugLoc DL = MI.getDebugLoc();
26832 const BasicBlock *LLVM_BB = BB->getBasicBlock();
// Only reachable when split stacks are enabled for this function.
26834 assert(MF->shouldSplitStack());
26836 const bool Is64Bit = Subtarget.is64Bit();
26837 const bool IsLP64 = Subtarget.isTarget64BitLP64();
// TLS segment register and the offset of the stack-limit slot within it.
// Values differ per ABI (presumably matching libgcc's split-stack layout —
// confirm against libgcc/generic-morestack).
26839 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
26840 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
26843 // ... [Till the alloca]
26844 // If stacklet is not large enough, jump to mallocMBB
26847 // Allocate by subtracting from RSP
26848 // Jump to continueMBB
26851 // Allocate by call to runtime
26855 // [rest of original BB]
26858 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26859 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26860 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26862 MachineRegisterInfo &MRI = MF->getRegInfo();
26863 const TargetRegisterClass *AddrRegClass =
26864 getRegClassFor(getPointerTy(MF->getDataLayout()));
26866 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26867 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26868 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
26869 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
26870 sizeVReg = MI.getOperand(1).getReg(),
26872 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
// Insert the new blocks in layout order right after BB.
26874 MachineFunction::iterator MBBIter = ++BB->getIterator();
26876 MF->insert(MBBIter, bumpMBB);
26877 MF->insert(MBBIter, mallocMBB);
26878 MF->insert(MBBIter, continueMBB);
26880 continueMBB->splice(continueMBB->begin(), BB,
26881 std::next(MachineBasicBlock::iterator(MI)), BB->end());
26882 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
26884 // Add code to the main basic block to check if the stack limit has been hit,
26885 // and if so, jump to mallocMBB otherwise to bumpMBB.
26886 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
// SPLimitVReg = SP - size, i.e. the prospective new stack pointer.
26887 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
26888 .addReg(tmpSPVReg).addReg(sizeVReg);
// Compare the TLS stack-limit slot against the prospective SP.
26889 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
26890 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26891 .addReg(SPLimitVReg);
26892 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
26894 // bumpMBB simply decreases the stack pointer, since we know the current
26895 // stacklet has enough space.
26896 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26897 .addReg(SPLimitVReg);
26898 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26899 .addReg(SPLimitVReg);
26900 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26902 // Calls into a routine in libgcc to allocate more space from the heap.
26903 const uint32_t *RegMask =
26904 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
// LP64: size in RDI, result in RAX.
26906 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26908 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26909 .addExternalSymbol("__morestack_allocate_stack_space")
26910 .addRegMask(RegMask)
26911 .addReg(X86::RDI, RegState::Implicit)
26912 .addReg(X86::RAX, RegState::ImplicitDefine);
// 64-bit ILP32 (x32): size in EDI, result in EAX.
26913 } else if (Is64Bit) {
26914 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26916 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26917 .addExternalSymbol("__morestack_allocate_stack_space")
26918 .addRegMask(RegMask)
26919 .addReg(X86::EDI, RegState::Implicit)
26920 .addReg(X86::EAX, RegState::ImplicitDefine);
// 32-bit: size is pushed on the stack; result in EAX.
26922 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26924 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26925 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26926 .addExternalSymbol("__morestack_allocate_stack_space")
26927 .addRegMask(RegMask)
26928 .addReg(X86::EAX, RegState::ImplicitDefine);
// Undo the 32-bit argument-area adjustment after the call.
26932 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
// Capture the returned pointer from the ABI return register.
26935 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26936 .addReg(IsLP64 ? X86::RAX : X86::EAX);
26937 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26939 // Set up the CFG correctly.
26940 BB->addSuccessor(bumpMBB);
26941 BB->addSuccessor(mallocMBB);
26942 mallocMBB->addSuccessor(continueMBB);
26943 bumpMBB->addSuccessor(continueMBB);
26945 // Take care of the PHI nodes.
// Result pointer is either the malloc'd pointer or the bumped SP.
26946 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26947 MI.getOperand(0).getReg())
26948 .addReg(mallocPtrVReg)
26950 .addReg(bumpSPPtrVReg)
26953 // Delete the original pseudo instruction.
26954 MI.eraseFromParent();
26957 return continueMBB;
// Expand a CATCHRET pseudo. On 32-bit C++ EH targets, insert an extra
// RestoreMBB that re-establishes the stack pointer (EH_RESTORE) before
// jumping to the real return destination; other targets pass through.
26960 MachineBasicBlock *
26961 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26962 MachineBasicBlock *BB) const {
26963 MachineFunction *MF = BB->getParent();
26964 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26965 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26966 DebugLoc DL = MI.getDebugLoc();
26968 assert(!isAsynchronousEHPersonality(
26969 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
26970 "SEH does not use catchret!");
26972 // Only 32-bit EH needs to worry about manually restoring stack pointers.
// 64-bit targets take an early exit here.
26973 if (!Subtarget.is32Bit())
26976 // C++ EH creates a new target block to hold the restore code, and wires up
26977 // the new block to the return destination with a normal JMP_4.
26978 MachineBasicBlock *RestoreMBB =
26979 MF->CreateMachineBasicBlock(BB->getBasicBlock());
26980 assert(BB->succ_size() == 1);
26981 MF->insert(std::next(BB->getIterator()), RestoreMBB);
26982 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26983 BB->addSuccessor(RestoreMBB);
// Redirect the catchret to land on the restore block instead.
26984 MI.getOperand(0).setMBB(RestoreMBB);
26986 auto RestoreMBBI = RestoreMBB->begin();
26987 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26988 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
// Expand a CATCHPAD pseudo. Only 32-bit SEH needs code here: an EH_RESTORE
// to fix up the stack pointer on entry to the pad. The pseudo itself is
// always removed.
26992 MachineBasicBlock *
26993 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26994 MachineBasicBlock *BB) const {
26995 MachineFunction *MF = BB->getParent();
26996 const Constant *PerFn = MF->getFunction().getPersonalityFn();
26997 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26998 // Only 32-bit SEH requires special handling for catchpad.
26999 if (IsSEH && Subtarget.is32Bit()) {
27000 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27001 DebugLoc DL = MI.getDebugLoc();
27002 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
27004 MI.eraseFromParent();
27008 MachineBasicBlock *
27009 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
27010 MachineBasicBlock *BB) const {
27011 // So, here we replace TLSADDR with the sequence:
27012 // adjust_stackdown -> TLSADDR -> adjust_stackup.
27013 // We need this because TLSADDR is lowered into calls
27014 // inside MC, therefore without the two markers shrink-wrapping
27015 // may push the prologue/epilogue past them.
27016 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27017 DebugLoc DL = MI.getDebugLoc();
27018 MachineFunction &MF = *BB->getParent();
27020 // Emit CALLSEQ_START right before the instruction.
27021 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
27022 MachineInstrBuilder CallseqStart =
27023 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
27024 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
27026 // Emit CALLSEQ_END right after the instruction.
27027 // We don't call erase from parent because we want to keep the
27028 // original instruction around.
27029 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
27030 MachineInstrBuilder CallseqEnd =
27031 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
27032 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
// Expand the Darwin TLS-call pseudo: load the TLV descriptor address into
// RDI (64-bit) or EAX (32-bit, PIC or non-PIC) and issue an indirect call
// through it; the result lands in the normal return register.
27037 MachineBasicBlock *
27038 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
27039 MachineBasicBlock *BB) const {
27040 // This is pretty easy. We're taking the value that we received from
27041 // our load from the relocation, sticking it in either RDI (x86-64)
27042 // or EAX and doing an indirect call. The return value will then
27043 // be in the normal return register.
27044 MachineFunction *F = BB->getParent();
27045 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27046 DebugLoc DL = MI.getDebugLoc();
27048 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
27049 assert(MI.getOperand(3).isGlobal() && "This should be a global");
27051 // Get a register mask for the lowered call.
27052 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
27053 // proper register mask.
27054 const uint32_t *RegMask =
27055 Subtarget.is64Bit() ?
27056 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
27057 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
// 64-bit: load through RDI, indirect CALL64m, result implicitly in RAX.
27058 if (Subtarget.is64Bit()) {
27059 MachineInstrBuilder MIB =
27060 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
27064 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27065 MI.getOperand(3).getTargetFlags())
27067 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
27068 addDirectMem(MIB, X86::RDI);
27069 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
// 32-bit non-PIC: absolute load through EAX.
27070 } else if (!isPositionIndependent()) {
27071 MachineInstrBuilder MIB =
27072 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27076 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27077 MI.getOperand(3).getTargetFlags())
27079 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27080 addDirectMem(MIB, X86::EAX);
27081 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
// 32-bit PIC: load relative to the global base register.
27083 MachineInstrBuilder MIB =
27084 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27085 .addReg(TII->getGlobalBaseReg(F))
27088 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27089 MI.getOperand(3).getTargetFlags())
27091 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27092 addDirectMem(MIB, X86::EAX);
27093 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27096 MI.eraseFromParent(); // The pseudo instruction is gone now.
// Map a RETPOLINE_* pseudo opcode to the real call / tail-call opcode that
// will be emitted once the callee has been moved into a scratch register
// and the target rewritten to the thunk symbol.
27100 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
27102 case X86::RETPOLINE_CALL32:
27103 return X86::CALLpcrel32;
27104 case X86::RETPOLINE_CALL64:
27105 return X86::CALL64pcrel32;
27106 case X86::RETPOLINE_TCRETURN32:
27107 return X86::TCRETURNdi;
27108 case X86::RETPOLINE_TCRETURN64:
27109 return X86::TCRETURNdi64;
27111 llvm_unreachable("not retpoline opcode");
// Return the name of the retpoline thunk for the chosen scratch register.
// External thunks use the GCC-compatible "__x86_indirect_thunk_*" names;
// otherwise the LLVM-emitted COMDAT "__llvm_retpoline_*" names are used.
27114 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
27116 if (Subtarget.useRetpolineExternalThunk()) {
27117 // When using an external thunk for retpolines, we pick names that match the
27118 // names GCC happens to use as well. This helps simplify the implementation
27119 // of the thunks for kernels where they have no easy ability to create
27120 // aliases and are doing non-trivial configuration of the thunk's body. For
27121 // example, the Linux kernel will do boot-time hot patching of the thunk
27122 // bodies and cannot easily export aliases of these to loaded modules.
27124 // Note that at any point in the future, we may need to change the semantics
27125 // of how we implement retpolines and at that time will likely change the
27126 // name of the called thunk. Essentially, there is no hard guarantee that
27127 // LLVM will generate calls to specific thunks, we merely make a best-effort
27128 // attempt to help out kernels and other systems where duplicating the
27129 // thunks is costly.
27132 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27133 return "__x86_indirect_thunk_eax";
27135 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27136 return "__x86_indirect_thunk_ecx";
27138 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27139 return "__x86_indirect_thunk_edx";
27141 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27142 return "__x86_indirect_thunk_edi";
27144 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27145 return "__x86_indirect_thunk_r11";
27147 llvm_unreachable("unexpected reg for retpoline");
27150 // When targeting an internal COMDAT thunk use an LLVM-specific name.
27153 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27154 return "__llvm_retpoline_eax";
27156 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27157 return "__llvm_retpoline_ecx";
27159 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27160 return "__llvm_retpoline_edx";
27162 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27163 return "__llvm_retpoline_edi";
27165 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27166 return "__llvm_retpoline_r11";
27168 llvm_unreachable("unexpected reg for retpoline");
// Expand a RETPOLINE_* call pseudo: copy the virtual callee register into a
// physical scratch register, then rewrite the pseudo in place into a direct
// call to the matching retpoline thunk (which performs the actual indirect
// transfer speculation-safely).
27171 MachineBasicBlock *
27172 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
27173 MachineBasicBlock *BB) const {
27174 // Copy the virtual register into the R11 physical register and
27175 // call the retpoline thunk.
27176 DebugLoc DL = MI.getDebugLoc();
27177 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27178 unsigned CalleeVReg = MI.getOperand(0).getReg();
27179 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
27181 // Find an available scratch register to hold the callee. On 64-bit, we can
27182 // just use R11, but we scan for uses anyway to ensure we don't generate
27183 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
27184 // already a register use operand to the call to hold the callee. If none
27185 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
27186 // register and ESI is the base pointer to realigned stack frames with VLAs.
27187 SmallVector<unsigned, 3> AvailableRegs;
27188 if (Subtarget.is64Bit())
27189 AvailableRegs.push_back(X86::R11);
27191 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
27193 // Zero out any registers that are already used.
27194 for (const auto &MO : MI.operands()) {
27195 if (MO.isReg() && MO.isUse())
27196 for (unsigned &Reg : AvailableRegs)
27197 if (Reg == MO.getReg())
27201 // Choose the first remaining non-zero available register.
27202 unsigned AvailableReg = 0;
27203 for (unsigned MaybeReg : AvailableRegs) {
27205 AvailableReg = MaybeReg;
// With every candidate clobbered there is no safe way to pass the callee.
27210 report_fatal_error("calling convention incompatible with retpoline, no "
27211 "available registers");
27213 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
// Move the callee into the scratch register, then morph the pseudo into a
// direct call to the thunk, marking the scratch reg as an implicit use.
27215 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
27216 .addReg(CalleeVReg);
27217 MI.getOperand(0).ChangeToES(Symbol);
27218 MI.setDesc(TII->get(Opc));
27219 MachineInstrBuilder(*BB->getParent(), &MI)
27220 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
// Expand the EH_SjLj_SetJmp pseudo into the block structure sketched in the
// comment below: store the resume address into the jmp_buf, emit the
// SjLjSetup marker, return 0 on the direct path (mainMBB) and 1 when resumed
// via longjmp (restoreMBB), joined by a PHI in sinkMBB.
27224 MachineBasicBlock *
27225 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
27226 MachineBasicBlock *MBB) const {
27227 DebugLoc DL = MI.getDebugLoc();
27228 MachineFunction *MF = MBB->getParent();
27229 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27230 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27231 MachineRegisterInfo &MRI = MF->getRegInfo();
27233 const BasicBlock *BB = MBB->getBasicBlock();
27234 MachineFunction::iterator I = ++MBB->getIterator();
27236 // Memory Reference
27237 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27238 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27241 unsigned MemOpndSlot = 0;
27243 unsigned CurOp = 0;
// Operand 0 is the i32 result (0 from setjmp, 1 from longjmp resume).
27245 DstReg = MI.getOperand(CurOp++).getReg();
27246 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27247 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
27249 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27250 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
// Remaining operands form the jmp_buf memory reference.
27252 MemOpndSlot = CurOp;
27254 MVT PVT = getPointerTy(MF->getDataLayout());
27255 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27256 "Invalid Pointer Size!");
27258 // For v = setjmp(buf), we generate
27261 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
27262 // SjLjSetup restoreMBB
27268 // v = phi(main, restore)
27271 // if base pointer being used, load it from frame
27274 MachineBasicBlock *thisMBB = MBB;
27275 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27276 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27277 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
27278 MF->insert(I, mainMBB);
27279 MF->insert(I, sinkMBB);
// restoreMBB goes at the end of the function; its address is stored in the
// jmp_buf, so it must be flagged as address-taken.
27280 MF->push_back(restoreMBB);
27281 restoreMBB->setHasAddressTaken();
27283 MachineInstrBuilder MIB;
27285 // Transfer the remainder of BB and its successor edges to sinkMBB.
27286 sinkMBB->splice(sinkMBB->begin(), MBB,
27287 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27288 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27291 unsigned PtrStoreOpc = 0;
27292 unsigned LabelReg = 0;
// Resume-address slot within the jmp_buf: one pointer past the start
// (slot 0 presumably holds the frame pointer — confirm with the longjmp
// expansion).
27293 const int64_t LabelOffset = 1 * PVT.getStoreSize();
// Small, non-PIC code can store the label as an immediate; otherwise it
// must first be materialized into a register with a (RIP- or GOT-relative)
// LEA.
27294 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27295 !isPositionIndependent();
27297 // Prepare IP either in reg or imm.
27298 if (!UseImmLabel) {
27299 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27300 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27301 LabelReg = MRI.createVirtualRegister(PtrRC);
27302 if (Subtarget.is64Bit()) {
27303 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
27307 .addMBB(restoreMBB)
27310 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
27311 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
27312 .addReg(XII->getGlobalBaseReg(MF))
27315 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
27319 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Store the resume address into buf[LabelOffset], re-using the pseudo's
// address operands with the displacement adjusted.
27321 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
27322 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27323 if (i == X86::AddrDisp)
27324 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
27326 MIB.add(MI.getOperand(MemOpndSlot + i));
27329 MIB.addReg(LabelReg);
27331 MIB.addMBB(restoreMBB);
27332 MIB.setMemRefs(MMOBegin, MMOEnd);
// The SjLjSetup marker keeps restoreMBB reachable and clobbers everything
// (no registers survive a longjmp back into this function).
27334 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
27335 .addMBB(restoreMBB);
27337 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27338 MIB.addRegMask(RegInfo->getNoPreservedMask());
27339 thisMBB->addSuccessor(mainMBB);
27340 thisMBB->addSuccessor(restoreMBB);
// mainMBB: the direct (setjmp returned 0) path.
27344 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
27345 mainMBB->addSuccessor(sinkMBB);
// sinkMBB: merge 0/1 results from the two paths.
27348 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
27349 TII->get(X86::PHI), DstReg)
27350 .addReg(mainDstReg).addMBB(mainMBB)
27351 .addReg(restoreDstReg).addMBB(restoreMBB);
// restoreMBB: if a base pointer is in use, reload it from the slot the
// frame lowering reserved for it, since longjmp does not restore it.
27354 if (RegInfo->hasBasePointer(*MF)) {
27355 const bool Uses64BitFramePtr =
27356 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27357 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
27358 X86FI->setRestoreBasePointer(MF);
27359 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
27360 unsigned BasePtr = RegInfo->getBaseRegister();
27361 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
27362 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
27363 FramePtr, true, X86FI->getRestoreBasePointerOffset())
27364 .setMIFlag(MachineInstr::FrameSetup);
// The resumed-from-longjmp path returns 1.
27366 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
27367 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
27368 restoreMBB->addSuccessor(sinkMBB);
27370 MI.eraseFromParent();
// Lower an EH_SjLj_LongJmp pseudo. Reloads the frame pointer, the resume
// address, and the stack pointer from the setjmp buffer addressed by MI's
// memory operands, then performs an indirect jump to the resume address.
// NOTE(review): some original source lines are elided in this view; the
// comments below describe only the visible code.
27374 MachineBasicBlock *
27375 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
27376 MachineBasicBlock *MBB) const {
27377 DebugLoc DL = MI.getDebugLoc();
27378 MachineFunction *MF = MBB->getParent();
27379 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27380 MachineRegisterInfo &MRI = MF->getRegInfo();
// Forward the pseudo's memory references onto each emitted reload.
27382 // Memory Reference
27383 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27384 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27386 MVT PVT = getPointerTy(MF->getDataLayout());
27387 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27388 "Invalid Pointer Size!");
27390 const TargetRegisterClass *RC =
27391 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27392 unsigned Tmp = MRI.createVirtualRegister(RC);
27393 // Since FP is only updated here but NOT referenced, it's treated as GPR.
27394 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27395 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
27396 unsigned SP = RegInfo->getStackRegister();
27398 MachineInstrBuilder MIB;
// Buffer layout (one pointer-sized slot each): slot 0 = FP,
// slot 1 = resume label (IP), slot 2 = SP.
27400 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27401 const int64_t SPOffset = 2 * PVT.getStoreSize();
27403 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
27404 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
// Reload FP from slot 0 (displacement unchanged).
27407 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
27408 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
27409 MIB.add(MI.getOperand(i));
27410 MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload the resume address into Tmp, bumping only the displacement
// operand by LabelOffset.
27412 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
27413 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27414 if (i == X86::AddrDisp)
27415 MIB.addDisp(MI.getOperand(i), LabelOffset);
27417 MIB.add(MI.getOperand(i));
27419 MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP from slot 2, again adjusting only the displacement.
27421 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
27422 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27423 if (i == X86::AddrDisp)
27424 MIB.addDisp(MI.getOperand(i), SPOffset);
27426 MIB.add(MI.getOperand(i));
27428 MIB.setMemRefs(MMOBegin, MMOEnd);
// Indirect jump to the reloaded resume address; the pseudo is now dead.
27430 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
27432 MI.eraseFromParent();
// Store the address of the SjLj dispatch block into the function context
// object (frame index FI) in the entry block, either as an immediate
// block address (small, non-PIC code model) or via an LEA into a vreg.
// NOTE(review): several lines (declarations of Op/VR, if/else structure,
// the FI parameter line) are elided in this view; comments are limited to
// what the visible code shows.
27436 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
27437 MachineBasicBlock *MBB,
27438 MachineBasicBlock *DispatchBB,
27440 DebugLoc DL = MI.getDebugLoc();
27441 MachineFunction *MF = MBB->getParent();
27442 MachineRegisterInfo *MRI = &MF->getRegInfo();
27443 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27445 MVT PVT = getPointerTy(MF->getDataLayout());
27446 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
// An immediate block address is only encodable for small, non-PIC code.
27451 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27452 !isPositionIndependent();
// Immediate form: store the block address directly to memory.
27455 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Register form: materialize the address with LEA, then store the vreg.
27457 const TargetRegisterClass *TRC =
27458 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27459 VR = MRI->createVirtualRegister(TRC);
27460 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27462 if (Subtarget.is64Bit())
27463 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
27467 .addMBB(DispatchBB)
27470 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
27471 .addReg(0) /* TII->getGlobalBaseReg(MF) */
27474 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
// Store the dispatch address into the function context. The 56/36 byte
// offsets select the return-slot field within the context object.
27478 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
27479 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
27481 MIB.addMBB(DispatchBB);
// Build the SjLj exception dispatch block: collects all landing pads,
// emits a jump table over them, wires up CFG successors, and retargets
// every invoke block at the new dispatch block.
// NOTE(review): many intermediate lines (else branches, closing braces,
// some operands) are elided in this view; comments describe only the
// visible code.
27486 MachineBasicBlock *
27487 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
27488 MachineBasicBlock *BB) const {
27489 DebugLoc DL = MI.getDebugLoc();
27490 MachineFunction *MF = BB->getParent();
27491 MachineFrameInfo &MFI = MF->getFrameInfo();
27492 MachineRegisterInfo *MRI = &MF->getRegInfo();
27493 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27494 int FI = MFI.getFunctionContextIndex();
27496 // Get a mapping of the call site numbers to all of the landing pads they're
27497 // associated with.
27498 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
27499 unsigned MaxCSNum = 0;
27500 for (auto &MBB : *MF) {
27501 if (!MBB.isEHPad())
// Find the EH_LABEL symbol at the head of the landing pad (skipping
// debug values) to look up its call-site numbers.
27504 MCSymbol *Sym = nullptr;
27505 for (const auto &MI : MBB) {
27506 if (MI.isDebugValue())
27509 assert(MI.isEHLabel() && "expected EH_LABEL");
27510 Sym = MI.getOperand(0).getMCSymbol();
27514 if (!MF->hasCallSiteLandingPad(Sym))
27517 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
27518 CallSiteNumToLPad[CSI].push_back(&MBB);
27519 MaxCSNum = std::max(MaxCSNum, CSI);
27523 // Get an ordered list of the machine basic blocks for the jump table.
27524 std::vector<MachineBasicBlock *> LPadList;
27525 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
27526 LPadList.reserve(CallSiteNumToLPad.size());
// Order landing pads by call-site number; also record all predecessor
// (invoke) blocks for later CFG rewiring.
27528 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
27529 for (auto &LP : CallSiteNumToLPad[CSI]) {
27530 LPadList.push_back(LP);
27531 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
27535 assert(!LPadList.empty() &&
27536 "No landing pad destinations for the dispatch jump table!");
27538 // Create the MBBs for the dispatch code.
27540 // Shove the dispatch's address into the return slot in the function context.
27541 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
27542 DispatchBB->setIsEHPad(true);
// TrapBB is reached when the call-site index is out of range.
27544 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
27545 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
27546 DispatchBB->addSuccessor(TrapBB);
27548 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
27549 DispatchBB->addSuccessor(DispContBB);
27552 MF->push_back(DispatchBB);
27553 MF->push_back(DispContBB);
27554 MF->push_back(TrapBB);
27556 // Insert code into the entry block that creates and registers the function
27558 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
27560 // Create the jump table and associated information
27561 unsigned JTE = getJumpTableEncoding();
27562 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
27563 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
27565 const X86RegisterInfo &RI = TII->getRegisterInfo();
27566 // Add a register mask with no preserved registers. This results in all
27567 // registers being marked as clobbered.
27568 if (RI.hasBasePointer(*MF)) {
27569 const bool FPIs64Bit =
27570 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27571 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
27572 MFI->setRestoreBasePointer(MF);
// Restore the base pointer from its save slot relative to the frame
// pointer, attaching the all-clobbered regmask to that instruction.
27574 unsigned FP = RI.getFrameRegister(*MF);
27575 unsigned BP = RI.getBaseRegister();
27576 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
27577 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
27578 MFI->getRestoreBasePointerOffset())
27579 .addRegMask(RI.getNoPreservedMask());
// No base pointer: hang the regmask on a NOOP instead.
27581 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
27582 .addRegMask(RI.getNoPreservedMask());
27585 // IReg is used as an index in a memory operand and therefore can't be SP
27586 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
// Load the call-site index from the function context and range-check it;
// out-of-range indices trap.
27587 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
27588 Subtarget.is64Bit() ? 8 : 4);
27589 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
27591 .addImm(LPadList.size());
27592 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
// 64-bit: index into the jump table according to its encoding.
27594 if (Subtarget.is64Bit()) {
27595 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27596 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
27598 // leaq .LJTI0_0(%rip), BReg
27599 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
27603 .addJumpTableIndex(MJTI)
27605 // movzx IReg64, IReg
27606 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
27609 .addImm(X86::sub_32bit);
27612 case MachineJumpTableInfo::EK_BlockAddress:
27613 // jmpq *(BReg,IReg64,8)
27614 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
// Label-difference entries: load a 32-bit offset, sign-extend, add the
// table base, and jump through the resulting address.
27621 case MachineJumpTableInfo::EK_LabelDifference32: {
27622 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
27623 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
27624 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27626 // movl (BReg,IReg64,4), OReg
27627 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
27633 // movsx OReg64, OReg
27634 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
27635 // addq BReg, OReg64, TReg
27636 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
27640 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
27644 llvm_unreachable("Unexpected jump table encoding");
// 32-bit: simple scaled indirect jump through the table.
27647 // jmpl *.LJTI0_0(,IReg,4)
27648 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
27652 .addJumpTableIndex(MJTI)
27656 // Add the jump table entries as successors to the MBB.
27657 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
27658 for (auto &LP : LPadList)
27659 if (SeenMBBs.insert(LP).second)
27660 DispContBB->addSuccessor(LP);
27662 // N.B. the order the invoke BBs are processed in doesn't matter here.
27663 SmallVector<MachineBasicBlock *, 64> MBBLPads;
27664 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
27665 for (MachineBasicBlock *MBB : InvokeBBs) {
27666 // Remove the landing pad successor from the invoke block and replace it
27667 // with the new dispatch block.
27668 // Keep a copy of Successors since it's modified inside the loop.
27669 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
27671 // FIXME: Avoid quadratic complexity.
27672 for (auto MBBS : Successors) {
27673 if (MBBS->isEHPad()) {
27674 MBB->removeSuccessor(MBBS);
27675 MBBLPads.push_back(MBBS);
27679 MBB->addSuccessor(DispatchBB);
27681 // Find the invoke call and mark all of the callee-saved registers as
27682 // 'implicit defined' so that they're spilled. This prevents code from
27683 // moving instructions to before the EH block, where they will never be
27685 for (auto &II : reverse(*MBB)) {
// Collect registers the call already defines so we don't add them twice.
27689 DenseMap<unsigned, bool> DefRegs;
27690 for (auto &MOp : II.operands())
27692 DefRegs[MOp.getReg()] = true;
27694 MachineInstrBuilder MIB(*MF, &II);
27695 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
27696 unsigned Reg = SavedRegs[RI];
27698 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
27705 // Mark all former landing pads as non-landing pads. The dispatch is the only
27706 // landing pad now.
27707 for (auto &LP : MBBLPads)
27708 LP->setIsEHPad(false);
27710 // The instruction is gone now.
27711 MI.eraseFromParent();
// Central dispatcher for pseudo-instructions that require custom MachineInstr
// expansion after instruction selection. Routes each pseudo opcode to its
// dedicated emit helper or expands it inline.
// NOTE(review): many intermediate lines (returns, closing braces, some
// operands) are elided in this view; comments describe only the visible code.
27715 MachineBasicBlock *
27716 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
27717 MachineBasicBlock *BB) const {
27718 MachineFunction *MF = BB->getParent();
27719 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27720 DebugLoc DL = MI.getDebugLoc();
27722 switch (MI.getOpcode()) {
27723 default: llvm_unreachable("Unexpected instr type to insert");
27724 case X86::TAILJMPd64:
27725 case X86::TAILJMPr64:
27726 case X86::TAILJMPm64:
27727 case X86::TAILJMPr64_REX:
27728 case X86::TAILJMPm64_REX:
27729 llvm_unreachable("TAILJMP64 would not be touched here.");
27730 case X86::TCRETURNdi64:
27731 case X86::TCRETURNri64:
27732 case X86::TCRETURNmi64:
27734 case X86::TLS_addr32:
27735 case X86::TLS_addr64:
27736 case X86::TLS_base_addr32:
27737 case X86::TLS_base_addr64:
27738 return EmitLoweredTLSAddr(MI, BB);
27739 case X86::RETPOLINE_CALL32:
27740 case X86::RETPOLINE_CALL64:
27741 case X86::RETPOLINE_TCRETURN32:
27742 case X86::RETPOLINE_TCRETURN64:
27743 return EmitLoweredRetpoline(MI, BB);
27744 case X86::CATCHRET:
27745 return EmitLoweredCatchRet(MI, BB);
27746 case X86::CATCHPAD:
27747 return EmitLoweredCatchPad(MI, BB);
27748 case X86::SEG_ALLOCA_32:
27749 case X86::SEG_ALLOCA_64:
27750 return EmitLoweredSegAlloca(MI, BB);
27751 case X86::TLSCall_32:
27752 case X86::TLSCall_64:
27753 return EmitLoweredTLSCall(MI, BB);
// All CMOV pseudos share a single select-lowering path.
27754 case X86::CMOV_FR32:
27755 case X86::CMOV_FR64:
27756 case X86::CMOV_FR128:
27757 case X86::CMOV_GR8:
27758 case X86::CMOV_GR16:
27759 case X86::CMOV_GR32:
27760 case X86::CMOV_RFP32:
27761 case X86::CMOV_RFP64:
27762 case X86::CMOV_RFP80:
27763 case X86::CMOV_V2F64:
27764 case X86::CMOV_V2I64:
27765 case X86::CMOV_V4F32:
27766 case X86::CMOV_V4F64:
27767 case X86::CMOV_V4I64:
27768 case X86::CMOV_V16F32:
27769 case X86::CMOV_V8F32:
27770 case X86::CMOV_V8F64:
27771 case X86::CMOV_V8I64:
27772 case X86::CMOV_V8I1:
27773 case X86::CMOV_V16I1:
27774 case X86::CMOV_V32I1:
27775 case X86::CMOV_V64I1:
27776 return EmitLoweredSelect(MI, BB);
// Read EFLAGS: push flags, pop into the destination register.
27778 case X86::RDFLAGS32:
27779 case X86::RDFLAGS64: {
27781 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
27782 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
27783 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
27784 // Permit reads of the EFLAGS and DF registers without them being defined.
27785 // This intrinsic exists to read external processor state in flags, such as
27786 // the trap flag, interrupt flag, and direction flag, none of which are
27787 // modeled by the backend.
27788 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
27789 "Unexpected register in operand!");
27790 Push->getOperand(2).setIsUndef();
27791 assert(Push->getOperand(3).getReg() == X86::DF &&
27792 "Unexpected register in operand!");
27793 Push->getOperand(3).setIsUndef();
27794 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
27796 MI.eraseFromParent(); // The pseudo is gone now.
// Write EFLAGS: push the source register, pop into flags.
27800 case X86::WRFLAGS32:
27801 case X86::WRFLAGS64: {
27803 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
27805 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
27806 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
27807 BuildMI(*BB, MI, DL, TII->get(PopF));
27809 MI.eraseFromParent(); // The pseudo is gone now.
27813 case X86::RELEASE_FADD32mr:
27814 case X86::RELEASE_FADD64mr:
27815 return EmitLoweredAtomicFP(MI, BB);
27817 case X86::FP32_TO_INT16_IN_MEM:
27818 case X86::FP32_TO_INT32_IN_MEM:
27819 case X86::FP32_TO_INT64_IN_MEM:
27820 case X86::FP64_TO_INT16_IN_MEM:
27821 case X86::FP64_TO_INT32_IN_MEM:
27822 case X86::FP64_TO_INT64_IN_MEM:
27823 case X86::FP80_TO_INT16_IN_MEM:
27824 case X86::FP80_TO_INT32_IN_MEM:
27825 case X86::FP80_TO_INT64_IN_MEM: {
27826 // Change the floating point control register to use "round towards zero"
27827 // mode when truncating to an integer value.
// Spill the current x87 control word to a 2-byte stack slot.
27828 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
27829 addFrameReference(BuildMI(*BB, MI, DL,
27830 TII->get(X86::FNSTCW16m)), CWFrameIdx);
27832 // Load the old value of the high byte of the control word...
27834 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
27835 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
27838 // Set the high part to be round to zero...
27839 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
27842 // Reload the modified control word now...
27843 addFrameReference(BuildMI(*BB, MI, DL,
27844 TII->get(X86::FLDCW16m)), CWFrameIdx);
27846 // Restore the memory image of control word to original value
27847 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
27850 // Get the X86 opcode to use.
27852 switch (MI.getOpcode()) {
27853 default: llvm_unreachable("illegal opcode!");
27854 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
27855 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
27856 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
27857 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
27858 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
27859 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
27860 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
27861 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
27862 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
// Emit the actual FP-to-int store using the pseudo's address operands.
27865 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27866 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
27867 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
27869 // Reload the original control word now.
27870 addFrameReference(BuildMI(*BB, MI, DL,
27871 TII->get(X86::FLDCW16m)), CWFrameIdx);
27873 MI.eraseFromParent(); // The pseudo instruction is gone now.
27876 // String/text processing lowering.
27877 case X86::PCMPISTRM128REG:
27878 case X86::VPCMPISTRM128REG:
27879 case X86::PCMPISTRM128MEM:
27880 case X86::VPCMPISTRM128MEM:
27881 case X86::PCMPESTRM128REG:
27882 case X86::VPCMPESTRM128REG:
27883 case X86::PCMPESTRM128MEM:
27884 case X86::VPCMPESTRM128MEM:
27885 assert(Subtarget.hasSSE42() &&
27886 "Target must have SSE4.2 or AVX features enabled")
27887 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
27889 // String/text processing lowering.
27890 case X86::PCMPISTRIREG:
27891 case X86::VPCMPISTRIREG:
27892 case X86::PCMPISTRIMEM:
27893 case X86::VPCMPISTRIMEM:
27894 case X86::PCMPESTRIREG:
27895 case X86::VPCMPESTRIREG:
27896 case X86::PCMPESTRIMEM:
27897 case X86::VPCMPESTRIMEM:
27898 assert(Subtarget.hasSSE42() &&
27899 "Target must have SSE4.2 or AVX features enabled");
27900 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
27902 // Thread synchronization.
27904 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
27905 case X86::MONITORX:
27906 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
27910 return emitClzero(&MI, BB, Subtarget);
27914 return emitWRPKRU(MI, BB, Subtarget);
27916 return emitRDPKRU(MI, BB, Subtarget);
27919 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
27921 case X86::VASTART_SAVE_XMM_REGS:
27922 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
27924 case X86::VAARG_64:
27925 return EmitVAARG64WithCustomInserter(MI, BB);
27927 case X86::EH_SjLj_SetJmp32:
27928 case X86::EH_SjLj_SetJmp64:
27929 return emitEHSjLjSetJmp(MI, BB);
27931 case X86::EH_SjLj_LongJmp32:
27932 case X86::EH_SjLj_LongJmp64:
27933 return emitEHSjLjLongJmp(MI, BB);
27935 case X86::Int_eh_sjlj_setup_dispatch:
27936 return EmitSjLjDispatchBlock(MI, BB);
27938 case TargetOpcode::STATEPOINT:
27939 // As an implementation detail, STATEPOINT shares the STACKMAP format at
27940 // this point in the process. We diverge later.
27941 return emitPatchPoint(MI, BB);
27943 case TargetOpcode::STACKMAP:
27944 case TargetOpcode::PATCHPOINT:
27945 return emitPatchPoint(MI, BB);
27947 case TargetOpcode::PATCHABLE_EVENT_CALL:
27948 // Do nothing here, handle in xray instrumentation pass.
27951 case X86::LCMPXCHG8B: {
27952 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
27953 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
27954 // requires a memory operand. If it happens that current architecture is
27955 // i686 and for current function we need a base pointer
27956 // - which is ESI for i686 - register allocator would not be able to
27957 // allocate registers for an address in form of X(%reg, %reg, Y)
27958 // - there never would be enough unreserved registers during regalloc
27959 // (without the need for base ptr the only option would be X(%edi, %esi, Y).
27960 // We are giving a hand to register allocator by precomputing the address in
27961 // a new vreg using LEA.
27963 // If it is not i686 or there is no base pointer - nothing to do here.
27964 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
27967 // Even though this code does not necessarily needs the base pointer to
27968 // be ESI, we check for that. The reason: if this assert fails, there are
27969 // some changes happened in the compiler base pointer handling, which most
27970 // probably have to be addressed somehow here.
27971 assert(TRI->getBaseRegister() == X86::ESI &&
27972 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
27973 "base pointer in mind");
27975 MachineRegisterInfo &MRI = MF->getRegInfo();
27976 MVT SPTy = getPointerTy(MF->getDataLayout());
27977 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27978 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
27980 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27981 // Regalloc does not need any help when the memory operand of CMPXCHG8B
27982 // does not use index register.
27983 if (AM.IndexReg == X86::NoRegister)
27986 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
27987 // four operand definitions that are E[ABCD] registers. We skip them and
27988 // then insert the LEA.
27989 MachineBasicBlock::iterator MBBI(MI);
27990 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
27991 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
27994 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
// Rewrite the CMPXCHG8B memory operand to the precomputed address.
27996 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
28000 case X86::LCMPXCHG16B:
// CMPXCHG8B/16B with saved base register: ensure the base pointer is a
// live-in of the block.
28002 case X86::LCMPXCHG8B_SAVE_EBX:
28003 case X86::LCMPXCHG16B_SAVE_RBX: {
28005 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
28006 if (!BB->isLiveIn(BasePtr))
28007 BB->addLiveIn(BasePtr);
28013 //===----------------------------------------------------------------------===//
28014 // X86 Optimization Hooks
28015 //===----------------------------------------------------------------------===//
// Compute known-zero/known-one bits for X86-specific DAG nodes, refining
// the generic SelectionDAG known-bits analysis.
// NOTE(review): switch/break/brace lines are partially elided in this view;
// comments describe only the visible code.
28017 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
28019 const APInt &DemandedElts,
28020 const SelectionDAG &DAG,
28021 unsigned Depth) const {
28022 unsigned BitWidth = Known.getBitWidth();
28023 unsigned Opc = Op.getOpcode();
28024 EVT VT = Op.getValueType();
28025 assert((Opc >= ISD::BUILTIN_OP_END ||
28026 Opc == ISD::INTRINSIC_WO_CHAIN ||
28027 Opc == ISD::INTRINSIC_W_CHAIN ||
28028 Opc == ISD::INTRINSIC_VOID) &&
28029 "Should use MaskedValueIsZero if you don't know whether Op"
28030 " is a target node!");
// SETCC produces 0/1: everything above bit 0 is known zero.
28035 case X86ISD::SETCC:
28036 Known.Zero.setBitsFrom(1);
// MOVMSK sets one low bit per source vector element; the rest are zero.
28038 case X86ISD::MOVMSK: {
28039 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
28040 Known.Zero.setBitsFrom(NumLoBits);
// Element extracts: query just the extracted lane, then zero-extend the
// result to the node's bit width.
28043 case X86ISD::PEXTRB:
28044 case X86ISD::PEXTRW: {
28045 SDValue Src = Op.getOperand(0);
28046 EVT SrcVT = Src.getValueType();
28047 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
28048 Op.getConstantOperandVal(1));
28049 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
28050 Known = Known.zextOrTrunc(BitWidth);
28051 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
// Immediate vector shifts: shift the known bits of the input; an
// out-of-range shift amount yields an all-zero result.
28054 case X86ISD::VSHLI:
28055 case X86ISD::VSRLI: {
28056 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
28057 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
28058 Known.setAllZero();
28062 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
28063 unsigned ShAmt = ShiftImm->getZExtValue();
28064 if (Opc == X86ISD::VSHLI) {
28065 Known.Zero <<= ShAmt;
28066 Known.One <<= ShAmt;
28067 // Low bits are known zero.
28068 Known.Zero.setLowBits(ShAmt);
28070 Known.Zero.lshrInPlace(ShAmt);
28071 Known.One.lshrInPlace(ShAmt);
28072 // High bits are known zero.
28073 Known.Zero.setHighBits(ShAmt);
// VZEXT: analyze the narrower source elements, then zero-extend.
28078 case X86ISD::VZEXT: {
28079 // TODO: Add DemandedElts support.
28080 SDValue N0 = Op.getOperand(0);
28081 unsigned NumElts = VT.getVectorNumElements();
28083 EVT SrcVT = N0.getValueType();
28084 unsigned InNumElts = SrcVT.getVectorNumElements();
28085 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
28086 assert(InNumElts >= NumElts && "Illegal VZEXT input");
28088 Known = KnownBits(InBitWidth);
28089 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
28090 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
28091 Known = Known.zext(BitWidth);
28092 Known.Zero.setBitsFrom(InBitWidth);
// CMOV: a bit is known only if it agrees in both possible operands.
28095 case X86ISD::CMOV: {
28096 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
28097 // If we don't know any bits, early out.
28098 if (Known.isUnknown())
28101 DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
28103 // Only known if known in both the LHS and RHS.
28104 Known.One &= Known2.One;
28105 Known.Zero &= Known2.Zero;
28108 case X86ISD::UDIVREM8_ZEXT_HREG:
28109 // TODO: Support more than just the zero extended bits?
28110 if (Op.getResNo() != 1)
28112 // The remainder is zero extended.
28113 Known.Zero.setBitsFrom(8);
// Return the minimum number of sign bits for X86-specific DAG nodes,
// refining the generic SelectionDAG sign-bit analysis.
// NOTE(review): switch/return/brace lines are partially elided in this view;
// comments describe only the visible code.
28118 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
28119 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
28120 unsigned Depth) const {
28121 unsigned VTBits = Op.getScalarValueSizeInBits();
28122 unsigned Opcode = Op.getOpcode();
28124 case X86ISD::SETCC_CARRY:
28125 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
// VSEXT: sign extension adds (dst width - src width) sign bits.
28128 case X86ISD::VSEXT: {
28129 // TODO: Add DemandedElts support.
28130 SDValue Src = Op.getOperand(0);
28131 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28132 Tmp += VTBits - Src.getScalarValueSizeInBits();
// VTRUNC: sign bits that survive the truncation carry over.
28136 case X86ISD::VTRUNC: {
28137 // TODO: Add DemandedElts support.
28138 SDValue Src = Op.getOperand(0);
28139 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
28140 assert(VTBits < NumSrcBits && "Illegal truncation input type");
28141 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28142 if (Tmp > (NumSrcBits - VTBits))
28143 return Tmp - (NumSrcBits - VTBits);
28147 case X86ISD::PACKSS: {
28148 // PACKSS is just a truncation if the sign bits extend to the packed size.
28149 // TODO: Add DemandedElts support.
28150 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
28151 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
28152 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
28153 unsigned Tmp = std::min(Tmp0, Tmp1);
28154 if (Tmp > (SrcBits - VTBits))
28155 return Tmp - (SrcBits - VTBits);
// Left shift by immediate consumes sign bits; shifting them all out
// leaves the result unknown (1 sign bit) or zero (all bits).
28159 case X86ISD::VSHLI: {
28160 SDValue Src = Op.getOperand(0);
28161 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28162 if (ShiftVal.uge(VTBits))
28163 return VTBits; // Shifted all bits out --> zero.
28164 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28165 if (ShiftVal.uge(Tmp))
28166 return 1; // Shifted all sign bits out --> unknown.
28167 return Tmp - ShiftVal.getZExtValue();
// Arithmetic right shift by immediate adds sign bits.
28170 case X86ISD::VSRAI: {
28171 SDValue Src = Op.getOperand(0);
28172 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28173 if (ShiftVal.uge(VTBits - 1))
28174 return VTBits; // Sign splat.
28175 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28177 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
28180 case X86ISD::PCMPGT:
28181 case X86ISD::PCMPEQ:
28183 case X86ISD::VPCOM:
28184 case X86ISD::VPCOMU:
28185 // Vector compares return zero/all-bits result values.
// CMOV: take the minimum over the two possible results.
28188 case X86ISD::CMOV: {
28189 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
28190 if (Tmp0 == 1) return 1; // Early out.
28191 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
28192 return std::min(Tmp0, Tmp1);
28194 case X86ISD::SDIVREM8_SEXT_HREG:
28195 // TODO: Support more than just the sign extended bits?
28196 if (Op.getResNo() != 1)
28198 // The remainder is sign extended.
// Strip an X86 Wrapper/WrapperRIP node to expose the underlying address
// operand (visible path; the fall-through return is elided in this view).
28206 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
28207 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
28208 return N->getOperand(0);
28212 /// Returns true (and the GlobalValue and the offset) if the node is a
28213 /// GlobalAddress + offset.
28214 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
28215 const GlobalValue* &GA,
28216 int64_t &Offset) const {
// X86 wraps global addresses in X86ISD::Wrapper; peek through it before
// deferring to the target-independent implementation.
28217 if (N->getOpcode() == X86ISD::Wrapper) {
28218 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
28219 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
28220 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
28224 return TargetLowering::isGAPlusOffset(N, GA, Offset);
28227 // Attempt to match a combined shuffle mask against supported unary shuffle
28229 // TODO: Investigate sharing more of this with shuffle lowering.
// On success sets Shuffle/SrcVT/DstVT (and may rewrite V1) and returns true.
// NOTE(review): several return/brace lines are elided in this view; comments
// describe only the visible code.
28230 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28231 bool AllowFloatDomain, bool AllowIntDomain,
28232 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
28233 const X86Subtarget &Subtarget,
28234 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
28235 unsigned NumMaskElts = Mask.size();
28236 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
28238 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
28239 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
28240 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
28241 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
28242 unsigned MaxScale = 64 / MaskEltSize;
// Try each power-of-two widening factor: element i must come from lane i
// with the intervening lanes undef/zero.
28243 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
28245 unsigned NumDstElts = NumMaskElts / Scale;
28246 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
28247 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
28248 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
28251 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
28252 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
28253 MVT::getIntegerVT(MaskEltSize);
28254 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
// If the source is narrower than the mask, extract the low subvector
// and use VZEXT; otherwise use the in-register zero-extend node.
28256 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
28257 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
28258 Shuffle = unsigned(X86ISD::VZEXT);
28260 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
28262 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
28263 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
28269 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
28270 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
28271 isUndefOrEqual(Mask[0], 0) &&
28272 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
28273 Shuffle = X86ISD::VZEXT_MOVL;
28274 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
28278 // Check if we have SSE3 which will let us use MOVDDUP etc. The
28279 // instructions are no slower than UNPCKLPD but has the option to
28280 // fold the input operand into even an unaligned memory load.
28281 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
28282 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
28283 Shuffle = X86ISD::MOVDDUP;
28284 SrcVT = DstVT = MVT::v2f64;
28287 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
28288 Shuffle = X86ISD::MOVSLDUP;
28289 SrcVT = DstVT = MVT::v4f32;
28292 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
28293 Shuffle = X86ISD::MOVSHDUP;
28294 SrcVT = DstVT = MVT::v4f32;
// 256-bit duplicate patterns (MOVDDUP/MOVSLDUP/MOVSHDUP) require AVX.
28299 if (MaskVT.is256BitVector() && AllowFloatDomain) {
28300 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
28301 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
28302 Shuffle = X86ISD::MOVDDUP;
28303 SrcVT = DstVT = MVT::v4f64;
28306 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
28307 Shuffle = X86ISD::MOVSLDUP;
28308 SrcVT = DstVT = MVT::v8f32;
28311 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
28312 Shuffle = X86ISD::MOVSHDUP;
28313 SrcVT = DstVT = MVT::v8f32;
// 512-bit duplicate patterns require AVX512.
28318 if (MaskVT.is512BitVector() && AllowFloatDomain) {
28319 assert(Subtarget.hasAVX512() &&
28320 "AVX512 required for 512-bit vector shuffles");
28321 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
28322 Shuffle = X86ISD::MOVDDUP;
28323 SrcVT = DstVT = MVT::v8f64;
28326 if (isTargetShuffleEquivalent(
28327 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
28328 Shuffle = X86ISD::MOVSLDUP;
28329 SrcVT = DstVT = MVT::v16f32;
28332 if (isTargetShuffleEquivalent(
28333 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
28334 Shuffle = X86ISD::MOVSHDUP;
28335 SrcVT = DstVT = MVT::v16f32;
28340 // Attempt to match against broadcast-from-vector.
28341 if (Subtarget.hasAVX2()) {
// A mask that repeats element 0 everywhere is a VBROADCAST.
28342 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
28343 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
28344 SrcVT = DstVT = MaskVT;
28345 Shuffle = X86ISD::VBROADCAST;
28353 // Attempt to match a combined shuffle mask against supported unary immediate
28354 // permute instructions.
28355 // TODO: Investigate sharing more of this with shuffle lowering.
28356 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28357 const APInt &Zeroable,
28358 bool AllowFloatDomain,
28359 bool AllowIntDomain,
28360 const X86Subtarget &Subtarget,
28361 unsigned &Shuffle, MVT &ShuffleVT,
28362 unsigned &PermuteImm) {
28363 unsigned NumMaskElts = Mask.size();
28364 unsigned InputSizeInBits = MaskVT.getSizeInBits();
28365 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
28366 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
28368 bool ContainsZeros =
28369 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28371 // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
28372 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
28373 // Check for lane crossing permutes.
28374 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
28375 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
28376 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
28377 Shuffle = X86ISD::VPERMI;
28378 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
28379 PermuteImm = getV4X86ShuffleImm(Mask);
28382 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
28383 SmallVector<int, 4> RepeatedMask;
28384 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
28385 Shuffle = X86ISD::VPERMI;
28386 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
28387 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
28391 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
28392 // VPERMILPD can permute with a non-repeating shuffle.
28393 Shuffle = X86ISD::VPERMILPI;
28394 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
28396 for (int i = 0, e = Mask.size(); i != e; ++i) {
28398 if (M == SM_SentinelUndef)
28400 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
28401 PermuteImm |= (M & 1) << i;
28407 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
28408 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
28409 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
28410 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
28411 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
28412 SmallVector<int, 4> RepeatedMask;
28413 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28414 // Narrow the repeated mask to create 32-bit element permutes.
28415 SmallVector<int, 4> WordMask = RepeatedMask;
28416 if (MaskScalarSizeInBits == 64)
28417 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
28419 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
28420 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
28421 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
28422 PermuteImm = getV4X86ShuffleImm(WordMask);
28427 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
28428 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
28429 SmallVector<int, 4> RepeatedMask;
28430 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28431 ArrayRef<int> LoMask(Mask.data() + 0, 4);
28432 ArrayRef<int> HiMask(Mask.data() + 4, 4);
28434 // PSHUFLW: permute lower 4 elements only.
28435 if (isUndefOrInRange(LoMask, 0, 4) &&
28436 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
28437 Shuffle = X86ISD::PSHUFLW;
28438 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28439 PermuteImm = getV4X86ShuffleImm(LoMask);
28443 // PSHUFHW: permute upper 4 elements only.
28444 if (isUndefOrInRange(HiMask, 4, 8) &&
28445 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
28446 // Offset the HiMask so that we can create the shuffle immediate.
28447 int OffsetHiMask[4];
28448 for (int i = 0; i != 4; ++i)
28449 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
28451 Shuffle = X86ISD::PSHUFHW;
28452 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28453 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
28459 // Attempt to match against byte/bit shifts.
28460 // FIXME: Add 512-bit support.
28461 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28462 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28463 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
28464 MaskScalarSizeInBits, Mask,
28465 0, Zeroable, Subtarget);
28466 if (0 < ShiftAmt) {
28467 PermuteImm = (unsigned)ShiftAmt;
28475 // Attempt to match a combined unary shuffle mask against supported binary
28476 // shuffle instructions.
28477 // TODO: Investigate sharing more of this with shuffle lowering.
// Match a shuffle mask against two-input, no-immediate shuffle instructions
// (MOVLHPS/MOVHLPS, MOVSD/MOVSS, PACKSS/PACKUS, UNPCKL/UNPCKH). On success
// sets Shuffle to the X86ISD opcode and SrcVT/DstVT to the operand/result
// types; V1/V2 may be rewritten by the match helpers.
// NOTE(review): this excerpt is sampled - some original lines (returns,
// closing braces, trailing parameters) are elided between the numbered lines.
28478 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28479 bool AllowFloatDomain, bool AllowIntDomain,
28480 SDValue &V1, SDValue &V2, SDLoc &DL,
28482 const X86Subtarget &Subtarget,
28483 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
28485 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28487 if (MaskVT.is128BitVector()) {
// {0, 0} on v2f64-style masks: duplicate the low half - MOVLHPS on v4f32.
28488 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
28490 Shuffle = X86ISD::MOVLHPS;
28491 SrcVT = DstVT = MVT::v4f32;
// {1, 1}: duplicate the high half - MOVHLPS on v4f32.
28494 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
28496 Shuffle = X86ISD::MOVHLPS;
28497 SrcVT = DstVT = MVT::v4f32;
// {0, 3}: take low element of V1, high element of V2 - MOVSD.
28500 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
28501 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28503 Shuffle = X86ISD::MOVSD;
28504 SrcVT = DstVT = MaskVT;
// {4, 1, 2, 3}: replace just the low element with V2's low element - MOVSS.
28507 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
28508 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28509 Shuffle = X86ISD::MOVSS;
28510 SrcVT = DstVT = MaskVT;
28515 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
28516 // TODO add support for 256/512-bit types.
28517 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
28518 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
28525 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
28526 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
28527 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28528 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
28529 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
28530 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
28531 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
28533 SrcVT = DstVT = MaskVT;
// AVX1 has no 256-bit integer unpacks - force the float-domain types.
28534 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
28535 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
// Match a shuffle mask against two-input shuffle instructions that take an
// immediate operand (PALIGNR, BLENDI, INSERTPS, SHUFPD, SHUFPS). On success
// sets Shuffle to the X86ISD opcode, ShuffleVT to the execution type and
// PermuteImm to the immediate; V1/V2 may be rewritten (e.g. forced to zero).
// NOTE(review): this excerpt is sampled - some original lines (returns,
// closing braces) are elided between the numbered lines shown here.
28543 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28544 const APInt &Zeroable,
28545 bool AllowFloatDomain,
28546 bool AllowIntDomain,
28547 SDValue &V1, SDValue &V2, SDLoc &DL,
28549 const X86Subtarget &Subtarget,
28550 unsigned &Shuffle, MVT &ShuffleVT,
28551 unsigned &PermuteImm) {
28552 unsigned NumMaskElts = Mask.size();
28553 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28555 // Attempt to match against PALIGNR byte rotate.
28556 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28557 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28558 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
28559 if (0 < ByteRotation) {
28560 Shuffle = X86ISD::PALIGNR;
// PALIGNR always executes on byte elements; the rotation is the immediate.
28561 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
28562 PermuteImm = ByteRotation;
28567 // Attempt to combine to X86ISD::BLENDI.
28568 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
28569 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
28570 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
28571 uint64_t BlendMask = 0;
28572 bool ForceV1Zero = false, ForceV2Zero = false;
28573 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
28574 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
28576 if (MaskVT == MVT::v16i16) {
28577 // We can only use v16i16 PBLENDW if the lanes are repeated.
28578 SmallVector<int, 8> RepeatedMask;
28579 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
28581 assert(RepeatedMask.size() == 8 &&
28582 "Repeated mask size doesn't match!");
// Build the 8-bit blend immediate: bit i selects V2 for element i.
28584 for (int i = 0; i < 8; ++i)
28585 if (RepeatedMask[i] >= 8)
28586 PermuteImm |= 1 << i;
28587 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28588 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28589 Shuffle = X86ISD::BLENDI;
28590 ShuffleVT = MaskVT;
28594 // Determine a type compatible with X86ISD::BLENDI.
28595 ShuffleVT = MaskVT;
28596 if (Subtarget.hasAVX2()) {
28597 if (ShuffleVT == MVT::v4i64)
28598 ShuffleVT = MVT::v8i32;
28599 else if (ShuffleVT == MVT::v2i64)
28600 ShuffleVT = MVT::v4i32;
28602 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
28603 ShuffleVT = MVT::v8i16;
28604 else if (ShuffleVT == MVT::v4i64)
28605 ShuffleVT = MVT::v4f64;
28606 else if (ShuffleVT == MVT::v8i32)
28607 ShuffleVT = MVT::v8f32;
// If we ended up with an integer blend type, rescale the blend mask to
// the (possibly narrower) element count of the chosen type.
28610 if (!ShuffleVT.isFloatingPoint()) {
28611 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
28613 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
28614 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
28615 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
28618 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28619 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28620 PermuteImm = (unsigned)BlendMask;
28621 Shuffle = X86ISD::BLENDI;
28627 // Attempt to combine to INSERTPS.
28628 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
28629 MaskVT.is128BitVector()) {
28630 if (Zeroable.getBoolValue() &&
28631 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
28632 Shuffle = X86ISD::INSERTPS;
28633 ShuffleVT = MVT::v4f32;
28638 // Attempt to combine to SHUFPD.
28639 if (AllowFloatDomain && EltSizeInBits == 64 &&
28640 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28641 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28642 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28643 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
28644 Shuffle = X86ISD::SHUFP;
28645 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
28650 // Attempt to combine to SHUFPS.
28651 if (AllowFloatDomain && EltSizeInBits == 32 &&
28652 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
28653 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28654 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28655 SmallVector<int, 4> RepeatedMask;
28656 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
28657 // Match each half of the repeated mask, to determine if it's just
28658 // referencing one of the vectors, is zeroable or entirely undef.
28659 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
28660 int M0 = RepeatedMask[Offset];
28661 int M1 = RepeatedMask[Offset + 1];
28663 if (isUndefInRange(RepeatedMask, Offset, 2)) {
28664 return DAG.getUNDEF(MaskVT);
28665 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
28666 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
28667 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
28668 return getZeroVector(MaskVT, Subtarget, DAG, DL);
28669 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
28670 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28671 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
28673 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
28674 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28675 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
// SHUFPS takes its low two elements from Lo and high two from Hi.
28682 int ShufMask[4] = {-1, -1, -1, -1};
28683 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
28684 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
28689 Shuffle = X86ISD::SHUFP;
28690 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
28691 PermuteImm = getV4X86ShuffleImm(ShufMask);
28700 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
28703 /// This is the leaf of the recursive combine below. When we have found some
28704 /// chain of single-use x86 shuffle instructions and accumulated the combined
28705 /// shuffle mask represented by them, this will try to pattern match that mask
28706 /// into either a single instruction if there is a special purpose instruction
28707 /// for this operation, or into a PSHUFB instruction which is a fully general
28708 /// instruction but should only be used to replace chains over a certain depth.
// NOTE(review): this excerpt is sampled - some original lines (returns,
// closing braces, local declarations) are elided between the numbered lines.
28709 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
28710 ArrayRef<int> BaseMask, int Depth,
28711 bool HasVariableMask, SelectionDAG &DAG,
28712 TargetLowering::DAGCombinerInfo &DCI,
28713 const X86Subtarget &Subtarget) {
28714 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
28715 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
28716 "Unexpected number of shuffle inputs!");
28718 // Find the inputs that enter the chain. Note that multiple uses are OK
28719 // here, we're not going to remove the operands we find.
28720 bool UnaryShuffle = (Inputs.size() == 1);
28721 SDValue V1 = peekThroughBitcasts(Inputs[0]);
28722 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
28723 : peekThroughBitcasts(Inputs[1]));
28725 MVT VT1 = V1.getSimpleValueType();
28726 MVT VT2 = V2.getSimpleValueType();
28727 MVT RootVT = Root.getSimpleValueType();
28728 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
28729 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
28730 "Vector size mismatch");
// A single-element mask is an identity shuffle - just bitcast the input.
28735 unsigned NumBaseMaskElts = BaseMask.size();
28736 if (NumBaseMaskElts == 1) {
28737 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
28738 return DAG.getBitcast(RootVT, V1);
28741 unsigned RootSizeInBits = RootVT.getSizeInBits();
28742 unsigned NumRootElts = RootVT.getVectorNumElements();
28743 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
28744 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
28745 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
28747 // Don't combine if we are an AVX512/EVEX target and the mask element size
28748 // is different from the root element size - this would prevent writemasks
28749 // from being reused.
28750 // TODO - this currently prevents all lane shuffles from occurring.
28751 // TODO - check for writemasks usage instead of always preventing combining.
28752 // TODO - attempt to narrow Mask back to writemask size.
28753 bool IsEVEXShuffle =
28754 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
28756 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
28758 // Handle 128-bit lane shuffles of 256-bit vectors.
28759 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
28760 // we need to use the zeroing feature.
28761 // TODO - this should support binary shuffles.
28762 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
28763 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
28764 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
28765 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
28766 return SDValue(); // Nothing to do!
28767 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
// VPERM2X128 immediate: 0x8 in a nibble zeroes that 128-bit half.
28768 unsigned PermMask = 0;
28769 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
28770 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
28772 Res = DAG.getBitcast(ShuffleVT, V1);
28773 DCI.AddToWorklist(Res.getNode());
28774 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
28775 DAG.getUNDEF(ShuffleVT),
28776 DAG.getConstant(PermMask, DL, MVT::i8));
28777 DCI.AddToWorklist(Res.getNode());
28778 return DAG.getBitcast(RootVT, Res);
28781 // For masks that have been widened to 128-bit elements or more,
28782 // narrow back down to 64-bit elements.
28783 SmallVector<int, 64> Mask;
28784 if (BaseMaskEltSizeInBits > 64) {
28785 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
28786 int MaskScale = BaseMaskEltSizeInBits / 64;
28787 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
28789 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
28792 unsigned NumMaskElts = Mask.size();
28793 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
28795 // Determine the effective mask value type.
28796 FloatDomain &= (32 <= MaskEltSizeInBits);
28797 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
28798 : MVT::getIntegerVT(MaskEltSizeInBits);
28799 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
28801 // Only allow legal mask types.
28802 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
28805 // Attempt to match the mask against known shuffle patterns.
28806 MVT ShuffleSrcVT, ShuffleVT;
28807 unsigned Shuffle, PermuteImm;
28809 // Which shuffle domains are permitted?
28810 // Permit domain crossing at higher combine depths.
28811 bool AllowFloatDomain = FloatDomain || (Depth > 3);
28812 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
28813 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
28815 // Determine zeroable mask elements.
28816 APInt Zeroable(NumMaskElts, 0);
28817 for (unsigned i = 0; i != NumMaskElts; ++i)
28818 if (isUndefOrZero(Mask[i]))
28819 Zeroable.setBit(i);
28821 if (UnaryShuffle) {
28822 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
28823 // directly if we don't shuffle the lower element and we shuffle the upper
28824 // (zero) elements within themselves.
28825 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
28826 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
28827 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
28828 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
28829 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
28830 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
28831 return DAG.getBitcast(RootVT, V1);
28835 SDValue NewV1 = V1; // Save operand in case early exit happens.
28836 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28837 NewV1, DL, DAG, Subtarget, Shuffle,
28838 ShuffleSrcVT, ShuffleVT) &&
28839 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28840 if (Depth == 1 && Root.getOpcode() == Shuffle)
28841 return SDValue(); // Nothing to do!
28842 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
28843 DCI.AddToWorklist(Res.getNode());
28844 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
28845 DCI.AddToWorklist(Res.getNode());
28846 return DAG.getBitcast(RootVT, Res);
28849 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
28850 AllowIntDomain, Subtarget, Shuffle,
28851 ShuffleVT, PermuteImm) &&
28852 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28853 if (Depth == 1 && Root.getOpcode() == Shuffle)
28854 return SDValue(); // Nothing to do!
28855 Res = DAG.getBitcast(ShuffleVT, V1);
28856 DCI.AddToWorklist(Res.getNode());
28857 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
28858 DAG.getConstant(PermuteImm, DL, MVT::i8));
28859 DCI.AddToWorklist(Res.getNode());
28860 return DAG.getBitcast(RootVT, Res);
28864 SDValue NewV1 = V1; // Save operands in case early exit happens.
28865 SDValue NewV2 = V2;
28866 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28867 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
28868 ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
28869 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28870 if (Depth == 1 && Root.getOpcode() == Shuffle)
28871 return SDValue(); // Nothing to do!
28872 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
28873 DCI.AddToWorklist(NewV1.getNode());
28874 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
28875 DCI.AddToWorklist(NewV2.getNode());
28876 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
28877 DCI.AddToWorklist(Res.getNode());
28878 return DAG.getBitcast(RootVT, Res);
28881 NewV1 = V1; // Save operands in case early exit happens.
28883 if (matchBinaryPermuteVectorShuffle(
28884 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
28885 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
28886 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28887 if (Depth == 1 && Root.getOpcode() == Shuffle)
28888 return SDValue(); // Nothing to do!
28889 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
28890 DCI.AddToWorklist(NewV1.getNode());
28891 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
28892 DCI.AddToWorklist(NewV2.getNode());
28893 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
28894 DAG.getConstant(PermuteImm, DL, MVT::i8));
28895 DCI.AddToWorklist(Res.getNode());
28896 return DAG.getBitcast(RootVT, Res);
28899 // Typically from here on, we need an integer version of MaskVT.
28900 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
28901 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
28903 // Annoyingly, SSE4A instructions don't map into the above match helpers.
28904 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
28905 uint64_t BitLen, BitIdx;
28906 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
28908 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
28909 return SDValue(); // Nothing to do!
28910 V1 = DAG.getBitcast(IntMaskVT, V1);
28911 DCI.AddToWorklist(V1.getNode());
28912 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
28913 DAG.getConstant(BitLen, DL, MVT::i8),
28914 DAG.getConstant(BitIdx, DL, MVT::i8));
28915 DCI.AddToWorklist(Res.getNode());
28916 return DAG.getBitcast(RootVT, Res);
28919 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
28920 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
28921 return SDValue(); // Nothing to do!
28922 V1 = DAG.getBitcast(IntMaskVT, V1);
28923 DCI.AddToWorklist(V1.getNode());
28924 V2 = DAG.getBitcast(IntMaskVT, V2);
28925 DCI.AddToWorklist(V2.getNode());
28926 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
28927 DAG.getConstant(BitLen, DL, MVT::i8),
28928 DAG.getConstant(BitIdx, DL, MVT::i8));
28929 DCI.AddToWorklist(Res.getNode());
28930 return DAG.getBitcast(RootVT, Res);
28934 // Don't try to re-form single instruction chains under any circumstances now
28935 // that we've done encoding canonicalization for them.
28939 // Depth threshold above which we can efficiently use variable mask shuffles.
28940 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
28941 bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
28943 bool MaskContainsZeros =
28944 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28946 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
28947 // If we have a single input lane-crossing shuffle then lower to VPERMV.
28948 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
28949 ((Subtarget.hasAVX2() &&
28950 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28951 (Subtarget.hasAVX512() &&
28952 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28953 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28954 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28955 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28956 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28957 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28958 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28959 DCI.AddToWorklist(VPermMask.getNode());
28960 Res = DAG.getBitcast(MaskVT, V1);
28961 DCI.AddToWorklist(Res.getNode());
28962 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
28963 DCI.AddToWorklist(Res.getNode());
28964 return DAG.getBitcast(RootVT, Res);
28967 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
28968 // vector as the second source.
28969 if (UnaryShuffle && AllowVariableMask &&
28970 ((Subtarget.hasAVX512() &&
28971 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28972 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28973 (Subtarget.hasVLX() &&
28974 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28975 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28976 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28977 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28978 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28979 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28980 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
28981 for (unsigned i = 0; i != NumMaskElts; ++i)
28982 if (Mask[i] == SM_SentinelZero)
28983 Mask[i] = NumMaskElts + i;
28985 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28986 DCI.AddToWorklist(VPermMask.getNode());
28987 Res = DAG.getBitcast(MaskVT, V1);
28988 DCI.AddToWorklist(Res.getNode());
28989 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
28990 DCI.AddToWorklist(Zero.getNode());
28991 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
28992 DCI.AddToWorklist(Res.getNode());
28993 return DAG.getBitcast(RootVT, Res);
28996 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
28997 if (AllowVariableMask && !MaskContainsZeros &&
28998 ((Subtarget.hasAVX512() &&
28999 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29000 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29001 (Subtarget.hasVLX() &&
29002 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29003 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29004 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29005 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29006 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29007 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29008 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29009 DCI.AddToWorklist(VPermMask.getNode());
29010 V1 = DAG.getBitcast(MaskVT, V1);
29011 DCI.AddToWorklist(V1.getNode());
29012 V2 = DAG.getBitcast(MaskVT, V2);
29013 DCI.AddToWorklist(V2.getNode());
29014 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
29015 DCI.AddToWorklist(Res.getNode());
29016 return DAG.getBitcast(RootVT, Res);
29021 // See if we can combine a single input shuffle with zeros to a bit-mask,
29022 // which is much simpler than any shuffle.
29023 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
29024 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
29025 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
29026 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
29027 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
29028 APInt UndefElts(NumMaskElts, 0);
29029 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
29030 for (unsigned i = 0; i != NumMaskElts; ++i) {
29032 if (M == SM_SentinelUndef) {
29033 UndefElts.setBit(i);
29036 if (M == SM_SentinelZero)
29038 EltBits[i] = AllOnes;
29040 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
29041 DCI.AddToWorklist(BitMask.getNode());
29042 Res = DAG.getBitcast(MaskVT, V1);
29043 DCI.AddToWorklist(Res.getNode());
29044 unsigned AndOpcode =
29045 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
29046 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
29047 DCI.AddToWorklist(Res.getNode());
29048 return DAG.getBitcast(RootVT, Res);
29051 // If we have a single input shuffle with different shuffle patterns in
29052 // the 128-bit lanes use the variable mask to VPERMILPV.
29053 // TODO Combine other mask types at higher depths.
29054 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29055 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
29056 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
29057 SmallVector<SDValue, 16> VPermIdx;
29058 for (int M : Mask) {
29060 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
29061 VPermIdx.push_back(Idx);
29063 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
29064 DCI.AddToWorklist(VPermMask.getNode());
29065 Res = DAG.getBitcast(MaskVT, V1);
29066 DCI.AddToWorklist(Res.getNode());
29067 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
29068 DCI.AddToWorklist(Res.getNode());
29069 return DAG.getBitcast(RootVT, Res);
29072 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
29073 // to VPERMIL2PD/VPERMIL2PS.
29074 if (AllowVariableMask && Subtarget.hasXOP() &&
29075 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
29076 MaskVT == MVT::v8f32)) {
29077 // VPERMIL2 Operation.
29078 // Bits[3] - Match Bit.
29079 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
29080 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
29081 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
29082 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
29083 SmallVector<int, 8> VPerm2Idx;
29084 unsigned M2ZImm = 0;
29085 for (int M : Mask) {
29086 if (M == SM_SentinelUndef) {
29087 VPerm2Idx.push_back(-1);
29090 if (M == SM_SentinelZero) {
29092 VPerm2Idx.push_back(8);
29095 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
29096 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
29097 VPerm2Idx.push_back(Index);
29099 V1 = DAG.getBitcast(MaskVT, V1);
29100 DCI.AddToWorklist(V1.getNode());
29101 V2 = DAG.getBitcast(MaskVT, V2);
29102 DCI.AddToWorklist(V2.getNode());
29103 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
29104 DCI.AddToWorklist(VPerm2MaskOp.getNode());
29105 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
29106 DAG.getConstant(M2ZImm, DL, MVT::i8));
29107 DCI.AddToWorklist(Res.getNode());
29108 return DAG.getBitcast(RootVT, Res);
29111 // If we have 3 or more shuffle instructions or a chain involving a variable
29112 // mask, we can replace them with a single PSHUFB instruction profitably.
29113 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
29114 // instructions, but in practice PSHUFB tends to be *very* fast so we're
29115 // more aggressive.
29116 if (UnaryShuffle && AllowVariableMask &&
29117 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29118 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
29119 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
29120 SmallVector<SDValue, 16> PSHUFBMask;
29121 int NumBytes = RootVT.getSizeInBits() / 8;
// Widen the mask to byte granularity (Ratio source bytes per mask element).
29122 int Ratio = NumBytes / NumMaskElts;
29123 for (int i = 0; i < NumBytes; ++i) {
29124 int M = Mask[i / Ratio];
29125 if (M == SM_SentinelUndef) {
29126 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
// PSHUFB zeroes the destination byte when the control byte's MSB is set.
29129 if (M == SM_SentinelZero) {
29130 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
29133 M = Ratio * M + i % Ratio;
29134 assert((M / 16) == (i / 16) && "Lane crossing detected");
29135 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29137 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
29138 Res = DAG.getBitcast(ByteVT, V1);
29139 DCI.AddToWorklist(Res.getNode());
29140 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
29141 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
29142 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
29143 DCI.AddToWorklist(Res.getNode());
29144 return DAG.getBitcast(RootVT, Res);
29147 // With XOP, if we have a 128-bit binary input shuffle we can always combine
29148 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
29149 // slower than PSHUFB on targets that support both.
29150 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
29151 // VPPERM Mask Operation
29152 // Bits[4:0] - Byte Index (0 - 31)
29153 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
29154 SmallVector<SDValue, 16> VPPERMMask;
29156 int Ratio = NumBytes / NumMaskElts;
29157 for (int i = 0; i < NumBytes; ++i) {
29158 int M = Mask[i / Ratio];
29159 if (M == SM_SentinelUndef) {
29160 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
29163 if (M == SM_SentinelZero) {
29164 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
29167 M = Ratio * M + i % Ratio;
29168 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29170 MVT ByteVT = MVT::v16i8;
29171 V1 = DAG.getBitcast(ByteVT, V1);
29172 DCI.AddToWorklist(V1.getNode());
29173 V2 = DAG.getBitcast(ByteVT, V2);
29174 DCI.AddToWorklist(V2.getNode());
29175 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
29176 DCI.AddToWorklist(VPPERMMaskOp.getNode());
29177 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
29178 DCI.AddToWorklist(Res.getNode());
29179 return DAG.getBitcast(RootVT, Res);
29182 // Failed to find any combines.
29186 // Attempt to constant fold all of the constant source ops.
29187 // Returns true if the entire shuffle is folded to a constant.
29188 // TODO: Extend this to merge multiple constant Ops and update the mask.
// Given a resolved shuffle (source ops in Ops, combined element mask in Mask,
// rooted at Root), try to fold the whole thing to one constant vector that is
// bitcast back to Root's type. Fails (elsewhere in the elided control flow)
// if any source op's constant bits cannot be extracted.
29189 static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
29190 ArrayRef<int> Mask, SDValue Root,
29191 bool HasVariableMask,
29193 TargetLowering::DAGCombinerInfo &DCI,
29194 const X86Subtarget &Subtarget) {
29195 MVT VT = Root.getSimpleValueType();
// Size (in bits) of each mask element; Mask indexes elements of this width.
29197 unsigned SizeInBits = VT.getSizeInBits();
29198 unsigned NumMaskElts = Mask.size();
29199 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
29200 unsigned NumOps = Ops.size();
29202 // Extract constant bits from each source op.
29203 bool OneUseConstantOp = false;
// Per-op undef element bitmask and per-op raw constant bits, both indexed to
// match the MaskSizeInBits element granularity.
29204 SmallVector<APInt, 16> UndefEltsOps(NumOps);
29205 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
29206 for (unsigned i = 0; i != NumOps; ++i) {
29207 SDValue SrcOp = Ops[i];
29208 OneUseConstantOp |= SrcOp.hasOneUse();
29209 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
29214 // Only fold if at least one of the constants is only used once or
29215 // the combined shuffle has included a variable mask shuffle, this
29216 // is to avoid constant pool bloat.
29217 if (!OneUseConstantOp && !HasVariableMask)
29220 // Shuffle the constant bits according to the mask.
29221 APInt UndefElts(NumMaskElts, 0);
29222 APInt ZeroElts(NumMaskElts, 0);
29223 APInt ConstantElts(NumMaskElts, 0);
29224 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
29225 APInt::getNullValue(MaskSizeInBits));
29226 for (unsigned i = 0; i != NumMaskElts; ++i) {
29228 if (M == SM_SentinelUndef) {
29229 UndefElts.setBit(i);
29231 } else if (M == SM_SentinelZero) {
29232 ZeroElts.setBit(i);
// Mask indices are linearized across all ops: op index is M / NumMaskElts,
// element within that op is M % NumMaskElts.
29235 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
29237 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
29238 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
29240 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
29241 if (SrcUndefElts[SrcMaskIdx]) {
29242 UndefElts.setBit(i);
29246 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
29247 APInt &Bits = SrcEltBits[SrcMaskIdx];
29249 ZeroElts.setBit(i);
29253 ConstantElts.setBit(i);
29254 ConstantBitData[i] = Bits;
// Every output element must have been classified exactly once.
29256 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
29258 // Create the constant data.
// Use an FP element type when the root is FP so no int<->fp domain crossing
// is introduced by the constant materialization.
29260 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
29261 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
29263 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
29265 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
29268 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
29269 DCI.AddToWorklist(CstOp.getNode());
29270 return DAG.getBitcast(VT, CstOp);
29273 /// \brief Fully generic combining of x86 shuffle instructions.
29275 /// This should be the last combine run over the x86 shuffle instructions. Once
29276 /// they have been fully optimized, this will recursively consider all chains
29277 /// of single-use shuffle instructions, build a generic model of the cumulative
29278 /// shuffle operation, and check for simpler instructions which implement this
29279 /// operation. We use this primarily for two purposes:
29281 /// 1) Collapse generic shuffles to specialized single instructions when
29282 /// equivalent. In most cases, this is just an encoding size win, but
29283 /// sometimes we will collapse multiple generic shuffles into a single
29284 /// special-purpose shuffle.
29285 /// 2) Look for sequences of shuffle instructions with 3 or more total
29286 /// instructions, and replace them with the slightly more expensive SSSE3
29287 /// PSHUFB instruction if available. We do this as the last combining step
29288 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
29289 /// a suitable short sequence of other instructions. The PSHUFB will either
29290 /// use a register or have to read from memory and so is slightly (but only
29291 /// slightly) more expensive than the other shuffle instructions.
29293 /// Because this is inherently a quadratic operation (for each shuffle in
29294 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
29295 /// This should never be an issue in practice as the shuffle lowering doesn't
29296 /// produce sequences of more than 8 instructions.
29298 /// FIXME: We will currently miss some cases where the redundant shuffling
29299 /// would simplify under the threshold for PSHUFB formation because of
29300 /// combine-ordering. To fix this, we should do the redundant instruction
29301 /// combining in this recursive walk.
29302 static SDValue combineX86ShufflesRecursively(
29303 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
29304 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
29305 bool HasVariableMask, SelectionDAG &DAG,
29306 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
29307 // Bound the depth of our recursive combine because this is ultimately
29308 // quadratic in nature.
29312 // Directly rip through bitcasts to find the underlying operand.
29313 SDValue Op = SrcOps[SrcOpIndex];
29314 Op = peekThroughOneUseBitcasts(Op);
29316 MVT VT = Op.getSimpleValueType();
29317 if (!VT.isVector())
29318 return SDValue(); // Bail if we hit a non-vector.
29320 assert(Root.getSimpleValueType().isVector() &&
29321 "Shuffles operate on vector types!");
29322 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
29323 "Can only combine shuffles of the same vector register size.");
29325 // Extract target shuffle mask and resolve sentinels and inputs.
29326 SmallVector<int, 64> OpMask;
29327 SmallVector<SDValue, 2> OpInputs;
29328 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
29331 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
29332 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
29333 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
29335 // Add the inputs to the Ops list, avoiding duplicates.
29336 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
// InputIdx0/1 record where each resolved shuffle input lives in Ops; -1 means
// not yet placed (the duplicate-detection assignments are in elided lines).
29338 int InputIdx0 = -1, InputIdx1 = -1;
29339 for (int i = 0, e = Ops.size(); i < e; ++i) {
29340 SDValue BC = peekThroughBitcasts(Ops[i]);
29341 if (Input0 && BC == peekThroughBitcasts(Input0))
29343 if (Input1 && BC == peekThroughBitcasts(Input1))
29347 if (Input0 && InputIdx0 < 0) {
// Reuse the slot of the op we're replacing for the first input.
29348 InputIdx0 = SrcOpIndex;
29349 Ops[SrcOpIndex] = Input0;
29351 if (Input1 && InputIdx1 < 0) {
29352 InputIdx1 = Ops.size();
29353 Ops.push_back(Input1);
29356 assert(((RootMask.size() > OpMask.size() &&
29357 RootMask.size() % OpMask.size() == 0) ||
29358 (OpMask.size() > RootMask.size() &&
29359 OpMask.size() % RootMask.size() == 0) ||
29360 OpMask.size() == RootMask.size()) &&
29361 "The smaller number of elements must divide the larger.");
29363 // This function can be performance-critical, so we rely on the power-of-2
29364 // knowledge that we have about the mask sizes to replace div/rem ops with
29365 // bit-masks and shifts.
29366 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
29367 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
29368 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
29369 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
// MaskWidth is the finer (larger) of the two mask granularities; RootRatio /
// OpRatio scale the coarser mask up to MaskWidth. Exactly one ratio is > 1.
29371 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
29372 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
29373 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
29374 assert((RootRatio == 1 || OpRatio == 1) &&
29375 "Must not have a ratio for both incoming and op masks!");
29377 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
29378 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
29379 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
29380 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
29381 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
29383 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
29385 // Merge this shuffle operation's mask into our accumulated mask. Note that
29386 // this shuffle's mask will be the first applied to the input, followed by the
29387 // root mask to get us all the way to the root value arrangement. The reason
29388 // for this order is that we are recursing up the operation chain.
29389 for (unsigned i = 0; i < MaskWidth; ++i) {
29390 unsigned RootIdx = i >> RootRatioLog2;
29391 if (RootMask[RootIdx] < 0) {
29392 // This is a zero or undef lane, we're done.
29393 Mask[i] = RootMask[RootIdx];
29397 unsigned RootMaskedIdx =
29399 ? RootMask[RootIdx]
29400 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
29402 // Just insert the scaled root mask value if it references an input other
29403 // than the SrcOp we're currently inserting.
29404 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
29405 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
29406 Mask[i] = RootMaskedIdx;
// Reduce to an index local to this op (masks are MaskWidth-wide per op).
29410 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
29411 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
29412 if (OpMask[OpIdx] < 0) {
29413 // The incoming lanes are zero or undef, it doesn't matter which ones we
29415 Mask[i] = OpMask[OpIdx];
29419 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
29420 unsigned OpMaskedIdx =
29423 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
// Rebase the element onto whichever of the op's inputs it came from, using
// the Ops slot recorded earlier.
29425 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
29426 if (OpMask[OpIdx] < (int)OpMask.size()) {
29427 assert(0 <= InputIdx0 && "Unknown target shuffle input");
29428 OpMaskedIdx += InputIdx0 * MaskWidth;
29430 assert(0 <= InputIdx1 && "Unknown target shuffle input");
29431 OpMaskedIdx += InputIdx1 * MaskWidth;
29434 Mask[i] = OpMaskedIdx;
29437 // Handle the all undef/zero cases early.
29438 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
29439 return DAG.getUNDEF(Root.getValueType());
29441 // TODO - should we handle the mixed zero/undef case as well? Just returning
29442 // a zero mask will lose information on undef elements possibly reducing
29443 // future combine possibilities.
29444 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
29445 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
29448 // Remove unused shuffle source ops.
29449 resolveTargetShuffleInputsAndMask(Ops, Mask);
29450 assert(!Ops.empty() && "Shuffle with no inputs detected");
29452 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
29454 // Update the list of shuffle nodes that have been combined so far.
29455 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
29457 CombinedNodes.push_back(Op.getNode());
29459 // See if we can recurse into each shuffle source op (if it's a target
29460 // shuffle). The source op should only be combined if it either has a
29461 // single use (i.e. current Op) or all its users have already been combined.
29462 for (int i = 0, e = Ops.size(); i < e; ++i)
29463 if (Ops[i].getNode()->hasOneUse() ||
29464 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
29465 if (SDValue Res = combineX86ShufflesRecursively(
29466 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
29467 DAG, DCI, Subtarget))
29470 // Attempt to constant fold all of the constant source ops.
29471 if (SDValue Cst = combineX86ShufflesConstants(
29472 Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
29475 // We can only combine unary and binary shuffle mask cases.
29476 if (Ops.size() > 2)
29479 // Minor canonicalization of the accumulated shuffle mask to make it easier
29480 // to match below. All this does is detect masks with sequential pairs of
29481 // elements, and shrink them to the half-width mask. It does this in a loop
29482 // so it will reduce the size of the mask to the minimal width mask which
29483 // performs an equivalent shuffle.
29484 SmallVector<int, 64> WidenedMask;
29485 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
29486 Mask = std::move(WidenedMask);
29489 // Canonicalization of binary shuffle masks to improve pattern matching by
29490 // commuting the inputs.
29491 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
29492 ShuffleVectorSDNode::commuteMask(Mask);
29493 std::swap(Ops[0], Ops[1]);
29496 // Finally, try to combine into a single shuffle instruction.
29497 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
29501 /// \brief Get the PSHUF-style mask from PSHUF node.
29503 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
29504 /// PSHUF-style masks that can be reused with such instructions.
// Returns a 4-element mask: for PSHUFD the dword mask as-is; for PSHUFLW /
// PSHUFHW only the relevant low/high word half (the adjustment code for the
// low-half case sits in elided lines of this extract).
29505 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
29506 MVT VT = N.getSimpleValueType();
29507 SmallVector<int, 4> Mask;
29508 SmallVector<SDValue, 2> Ops;
29511 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
29515 // If we have more than 128-bits, only the low 128-bits of shuffle mask
29516 // matter. Check that the upper masks are repeats and remove them.
29517 if (VT.getSizeInBits() > 128) {
29518 int LaneElts = 128 / VT.getScalarSizeInBits();
// Debug-only verification that every upper lane repeats the low lane.
29520 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
29521 for (int j = 0; j < LaneElts; ++j)
29522 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
29523 "Mask doesn't repeat in high 128-bit lanes!");
29525 Mask.resize(LaneElts);
29528 switch (N.getOpcode()) {
29529 case X86ISD::PSHUFD:
29531 case X86ISD::PSHUFLW:
29534 case X86ISD::PSHUFHW:
// Keep only the high-word half of the mask for PSHUFHW.
29535 Mask.erase(Mask.begin(), Mask.begin() + 4);
29536 for (int &M : Mask)
29540 llvm_unreachable("No valid shuffle instruction found!");
29544 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
29546 /// We walk up the chain and look for a combinable shuffle, skipping over
29547 /// shuffles that we could hoist this shuffle's transformation past without
29548 /// altering anything.
// On success, merges N's dword mask into the found shuffle, rebuilds the
// intervening chain on top of the merged shuffle, and returns the new chain
// head (bitcast to N's type) to replace N.
29550 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
29551 SelectionDAG &DAG) {
29552 assert(N.getOpcode() == X86ISD::PSHUFD &&
29553 "Called with something other than an x86 128-bit half shuffle!");
29556 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
29557 // of the shuffles in the chain so that we can form a fresh chain to replace
29559 SmallVector<SDValue, 8> Chain;
29560 SDValue V = N.getOperand(0);
29561 for (; V.hasOneUse(); V = V.getOperand(0)) {
29562 switch (V.getOpcode()) {
29564 return SDValue(); // Nothing combined!
29567 // Skip bitcasts as we always know the type for the target specific
29571 case X86ISD::PSHUFD:
29572 // Found another dword shuffle.
29575 case X86ISD::PSHUFLW:
29576 // Check that the low words (being shuffled) are the identity in the
29577 // dword shuffle, and the high words are self-contained.
29578 if (Mask[0] != 0 || Mask[1] != 1 ||
29579 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
29582 Chain.push_back(V);
29585 case X86ISD::PSHUFHW:
29586 // Check that the high words (being shuffled) are the identity in the
29587 // dword shuffle, and the low words are self-contained.
29588 if (Mask[2] != 2 || Mask[3] != 3 ||
29589 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
29592 Chain.push_back(V);
29595 case X86ISD::UNPCKL:
29596 case X86ISD::UNPCKH:
29597 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
29598 // shuffle into a preceding word shuffle.
29599 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
29600 V.getSimpleValueType().getVectorElementType() != MVT::i16)
29603 // Search for a half-shuffle which we can combine with.
29604 unsigned CombineOp =
29605 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
// Only the self-unpack form (both operands identical, solely used by this
// unpack) is safe to look through.
29606 if (V.getOperand(0) != V.getOperand(1) ||
29607 !V->isOnlyUserOf(V.getOperand(0).getNode()))
29609 Chain.push_back(V);
29610 V = V.getOperand(0);
29612 switch (V.getOpcode()) {
29614 return SDValue(); // Nothing to combine.
29616 case X86ISD::PSHUFLW:
29617 case X86ISD::PSHUFHW:
29618 if (V.getOpcode() == CombineOp)
29621 Chain.push_back(V);
29625 V = V.getOperand(0);
29629 } while (V.hasOneUse());
29632 // Break out of the loop if we break out of the switch.
29636 if (!V.hasOneUse())
29637 // We fell out of the loop without finding a viable combining instruction.
29640 // Merge this node's mask and our incoming mask.
29641 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29642 for (int &M : Mask)
29644 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
29645 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29647 // Rebuild the chain around this new shuffle.
29648 while (!Chain.empty()) {
29649 SDValue W = Chain.pop_back_val();
// Re-insert any bitcast needed to match the operand type of the next link.
29651 if (V.getValueType() != W.getOperand(0).getValueType())
29652 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
29654 switch (W.getOpcode()) {
29656 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
29658 case X86ISD::UNPCKL:
29659 case X86ISD::UNPCKH:
// The looked-through unpacks were self-unpacks, so feed V to both operands.
29660 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
29663 case X86ISD::PSHUFD:
29664 case X86ISD::PSHUFLW:
29665 case X86ISD::PSHUFHW:
29666 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
29670 if (V.getValueType() != N.getValueType())
29671 V = DAG.getBitcast(N.getValueType(), V);
29673 // Return the new chain to replace N.
29677 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
29680 /// We walk up the chain, skipping shuffles of the other half and looking
29681 /// through shuffles which switch halves trying to find a shuffle of the same
29682 /// pair of dwords.
// Returns true when a combine was performed (callers then stop processing N);
// the combine itself is applied through DCI.CombineTo below.
29683 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
29685 TargetLowering::DAGCombinerInfo &DCI) {
29687 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
29688 "Called with something other than an x86 128-bit half shuffle!");
29690 unsigned CombineOpcode = N.getOpcode();
29692 // Walk up a single-use chain looking for a combinable shuffle.
29693 SDValue V = N.getOperand(0);
29694 for (; V.hasOneUse(); V = V.getOperand(0)) {
29695 switch (V.getOpcode()) {
29697 return false; // Nothing combined!
29700 // Skip bitcasts as we always know the type for the target specific
29704 case X86ISD::PSHUFLW:
29705 case X86ISD::PSHUFHW:
// Found a half shuffle of the same half we're shuffling — combinable.
29706 if (V.getOpcode() == CombineOpcode)
29709 // Other-half shuffles are no-ops.
29712 // Break out of the loop if we break out of the switch.
29716 if (!V.hasOneUse())
29717 // We fell out of the loop without finding a viable combining instruction.
29720 // Combine away the bottom node as its shuffle will be accumulated into
29721 // a preceding shuffle.
29722 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29724 // Record the old value.
29727 // Merge this node's mask and our incoming mask (adjusted to account for all
29728 // the pshufd instructions encountered).
29729 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29730 for (int &M : Mask)
// Rebuild the found half shuffle with the merged immediate mask.
29732 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
29733 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29735 // Check that the shuffles didn't cancel each other out. If not, we need to
29736 // combine to the new one.
29738 // Replace the combinable shuffle with the combined one, updating all users
29739 // so that we re-evaluate the chain here.
29740 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
29745 /// \brief Try to combine x86 target specific shuffles.
// Dispatches on N's opcode: horizontal-op folding for 64-bit element binary
// shuffles, then per-opcode peepholes (UNPCKL, BLENDI, MOVSD/MOVSS, INSERTPS)
// and the PSHUF* half/dword shuffle chain combines.
29746 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
29747 TargetLowering::DAGCombinerInfo &DCI,
29748 const X86Subtarget &Subtarget) {
29750 MVT VT = N.getSimpleValueType();
29751 SmallVector<int, 4> Mask;
29752 unsigned Opcode = N.getOpcode();
29754 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
29755 // single instruction.
29756 if (VT.getScalarSizeInBits() == 64 &&
29757 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
29758 Opcode == X86ISD::UNPCKL)) {
29759 auto BC0 = peekThroughBitcasts(N.getOperand(0));
29760 auto BC1 = peekThroughBitcasts(N.getOperand(1));
29761 EVT VT0 = BC0.getValueType();
29762 EVT VT1 = BC1.getValueType();
29763 unsigned Opcode0 = BC0.getOpcode();
29764 unsigned Opcode1 = BC1.getOpcode();
29765 if (Opcode0 == Opcode1 && VT0 == VT1 &&
29766 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
29767 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
29768 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
// Pick which operands of the two horizontal ops feed the merged op,
// depending on which halves the shuffle selects.
29770 if (Opcode == X86ISD::MOVSD) {
29771 Lo = BC1.getOperand(0);
29772 Hi = BC0.getOperand(1);
29774 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29775 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29777 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
29778 DCI.AddToWorklist(Horiz.getNode());
29779 return DAG.getBitcast(VT, Horiz);
29784 case X86ISD::PSHUFD:
29785 case X86ISD::PSHUFLW:
29786 case X86ISD::PSHUFHW:
29787 Mask = getPSHUFShuffleMask(N);
29788 assert(Mask.size() == 4);
29790 case X86ISD::UNPCKL: {
29791 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
29792 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
29793 // moves upper half elements into the lower half part. For example:
29795 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
29797 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
29799 // will be combined to:
29801 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
29803 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
29804 // happen due to advanced instructions.
29805 if (!VT.is128BitVector())
29808 auto Op0 = N.getOperand(0);
29809 auto Op1 = N.getOperand(1);
29810 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
29811 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
// Expected mask: low half takes the source's upper half, rest undef.
29813 unsigned NumElts = VT.getVectorNumElements();
29814 SmallVector<int, 8> ExpectedMask(NumElts, -1);
29815 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
29818 auto ShufOp = Op1.getOperand(0);
29819 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
29820 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
29824 case X86ISD::BLENDI: {
29825 SDValue V0 = N->getOperand(0);
29826 SDValue V1 = N->getOperand(1);
29827 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
29828 "Unexpected input vector types");
29830 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
29831 // operands and changing the mask to 1. This saves us a bunch of
29832 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
29833 // x86InstrInfo knows how to commute this back after instruction selection
29834 // if it would help register allocation.
29836 // TODO: If optimizing for size or a processor that doesn't suffer from
29837 // partial register update stalls, this should be transformed into a MOVSD
29838 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
29840 if (VT == MVT::v2f64)
29841 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
29842 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
29843 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
29844 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
29849 case X86ISD::MOVSD:
29850 case X86ISD::MOVSS: {
29851 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
29852 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
29853 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
29854 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
// If both inputs are zero the result handling happens in elided lines.
29855 if (isZero0 && isZero1)
29858 // We often lower to MOVSD/MOVSS from integer as well as native float
29859 // types; remove unnecessary domain-crossing bitcasts if we can to make it
29860 // easier to combine shuffles later on. We've already accounted for the
29861 // domain switching cost when we decided to lower with it.
29862 bool isFloat = VT.isFloatingPoint();
29863 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
29864 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
29865 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
29866 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
29867 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
29868 V0 = DAG.getBitcast(NewVT, V0);
29869 V1 = DAG.getBitcast(NewVT, V1);
29870 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
29875 case X86ISD::INSERTPS: {
29876 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
29877 SDValue Op0 = N.getOperand(0);
29878 SDValue Op1 = N.getOperand(1);
29879 SDValue Op2 = N.getOperand(2);
// INSERTPS immediate layout: bits[7:6] = source elt, bits[5:4] = dest elt,
// bits[3:0] = zero mask.
29880 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
29881 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
29882 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
29883 unsigned ZeroMask = InsertPSMask & 0xF;
29885 // If we zero out all elements from Op0 then we don't need to reference it.
29886 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
29887 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
29888 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29890 // If we zero out the element from Op1 then we don't need to reference it.
29891 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
29892 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29893 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29895 // Attempt to merge insertps Op1 with an inner target shuffle node.
29896 SmallVector<int, 8> TargetMask1;
29897 SmallVector<SDValue, 2> Ops1;
29898 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
29899 int M = TargetMask1[SrcIdx];
29900 if (isUndefOrZero(M)) {
29901 // Zero/UNDEF insertion - zero out element and remove dependency.
29902 InsertPSMask |= (1u << DstIdx);
29903 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29904 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29906 // Update insertps mask srcidx and reference the source input directly.
29907 assert(0 <= M && M < 8 && "Shuffle index out of range");
29908 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
29909 Op1 = Ops1[M < 4 ? 0 : 1];
29910 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29911 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29914 // Attempt to merge insertps Op0 with an inner target shuffle node.
29915 SmallVector<int, 8> TargetMask0;
29916 SmallVector<SDValue, 2> Ops0;
29917 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
29920 bool Updated = false;
29921 bool UseInput00 = false;
29922 bool UseInput01 = false;
29923 for (int i = 0; i != 4; ++i) {
29924 int M = TargetMask0[i];
29925 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
29926 // No change if element is already zero or the inserted element.
29928 } else if (isUndefOrZero(M)) {
29929 // If the target mask is undef/zero then we must zero the element.
29930 InsertPSMask |= (1u << i);
29935 // The input vector element must be inline.
29936 if (M != i && M != (i + 4))
29939 // Determine which inputs of the target shuffle we're using.
29940 UseInput00 |= (0 <= M && M < 4);
29941 UseInput01 |= (4 <= M);
29944 // If we're not using both inputs of the target shuffle then use the
29945 // referenced input directly.
29946 if (UseInput00 && !UseInput01) {
29949 } else if (!UseInput00 && UseInput01) {
29955 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29956 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29964 // Nuke no-op shuffles that show up after combining.
29965 if (isNoopShuffleMask(Mask))
29966 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29968 // Look for simplifications involving one or two shuffle instructions.
29969 SDValue V = N.getOperand(0);
29970 switch (N.getOpcode()) {
29973 case X86ISD::PSHUFLW:
29974 case X86ISD::PSHUFHW:
29975 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
29977 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
29978 return SDValue(); // We combined away this shuffle, so we're done.
29980 // See if this reduces to a PSHUFD which is no more expensive and can
29981 // combine with more operations. Note that it has to at least flip the
29982 // dwords as otherwise it would have been removed as a no-op.
29983 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
29984 int DMask[] = {0, 1, 2, 3};
29985 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
29986 DMask[DOffset + 0] = DOffset + 1;
29987 DMask[DOffset + 1] = DOffset + 0;
29988 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
29989 V = DAG.getBitcast(DVT, V);
29990 DCI.AddToWorklist(V.getNode());
29991 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
29992 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
29993 DCI.AddToWorklist(V.getNode());
29994 return DAG.getBitcast(VT, V);
29997 // Look for shuffle patterns which can be implemented as a single unpack.
29998 // FIXME: This doesn't handle the location of the PSHUFD generically, and
29999 // only works when we have a PSHUFD followed by two half-shuffles.
30000 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
30001 (V.getOpcode() == X86ISD::PSHUFLW ||
30002 V.getOpcode() == X86ISD::PSHUFHW) &&
30003 V.getOpcode() != N.getOpcode() &&
30005 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
30006 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
// Compose the two half-shuffle masks and the dword mask into one 8-word
// mask and check it forms an unpack pattern.
30007 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30008 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
30009 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30010 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30012 for (int i = 0; i < 4; ++i) {
30013 WordMask[i + NOffset] = Mask[i] + NOffset;
30014 WordMask[i + VOffset] = VMask[i] + VOffset;
30016 // Map the word mask through the DWord mask.
30018 for (int i = 0; i < 8; ++i)
30019 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
30020 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
30021 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
30022 // We can replace all three shuffles with an unpack.
30023 V = DAG.getBitcast(VT, D.getOperand(0));
30024 DCI.AddToWorklist(V.getNode());
30025 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
30034 case X86ISD::PSHUFD:
30035 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
30044 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
30045 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
30046 /// are written to the parameters \p Opnd0 and \p Opnd1.
30048 /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
30049 /// so it is easier to generically match. We also insert dummy vector shuffle
30050 /// nodes for the operands which explicitly discard the lanes which are unused
30051 /// by this operation to try to flow through the rest of the combiner
30052 /// the fact that they're unused.
// NOTE(review): the embedded numbering below is non-contiguous — failure-path
// statements (e.g. `return false;`) appear to be elided from this listing.
// \p matchSubAdd selects the mirrored pattern: FADD-first (SUBADD) instead of
// the default FSUB-first (ADDSUB).
30053 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
30054 SDValue &Opnd0, SDValue &Opnd1,
30055 bool matchSubAdd = false) {
// Only float vector types with a native ADDSUB-capable width for the
// available ISA level are considered.
30057 EVT VT = N->getValueType(0);
30058 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
30059 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
30060 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
30063 // We only handle target-independent shuffles.
30064 // FIXME: It would be easy and harmless to use the target shuffle mask
30065 // extraction tool to support more.
30066 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
30069 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
// Local mutable copy — may be commuted below to canonicalize operand order.
30070 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
30072 SDValue V1 = N->getOperand(0);
30073 SDValue V2 = N->getOperand(1);
30075 unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB;
30076 unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD;
30078 // We require the first shuffle operand to be the ExpectedOpcode node,
30079 // and the second to be the NextExpectedOpcode node.
30080 if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) {
// Operands arrived swapped: commute the mask so the checks below can assume
// the canonical (ExpectedOpcode, NextExpectedOpcode) order.
30081 ShuffleVectorSDNode::commuteMask(Mask);
30083 } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode)
30086 // If there are other uses of these operations we can't fold them.
30087 if (!V1->hasOneUse() || !V2->hasOneUse())
30090 // Ensure that both operations have the same operands. Note that we can
30091 // commute the FADD operands.
30092 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
30093 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
30094 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
30097 // We're looking for blends between FADD and FSUB nodes. We insist on these
30098 // nodes being lined up in a specific expected pattern.
// Accepted masks interleave even lanes of V1 with odd lanes of V2, for each
// supported element count (2/4/8/16).
30099 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
30100 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
30101 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
30102 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
30103 8, 25, 10, 27, 12, 29, 14, 31})))
30111 /// \brief Try to combine a shuffle into a target-specific add-sub or
30112 /// mul-add-sub node.
// NOTE(review): numbering below is non-contiguous — e.g. the `SDValue Opnd2;`
// declaration and the early `return SDValue();` lines are elided in this listing.
30113 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
30114 const X86Subtarget &Subtarget,
30115 SelectionDAG &DAG) {
30116 SDValue Opnd0, Opnd1;
// First recognize the abstract ADDSUB shuffle pattern; Opnd0/Opnd1 receive
// the shared operands on success.
30117 if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1))
30120 EVT VT = N->getValueType(0);
30123 // Try to generate X86ISD::FMADDSUB node here.
// Prefer the fused form when one operand is a (2-use-tolerant) FMUL.
30125 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
30126 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
30128 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
30129 // the ADDSUB idiom has been successfully recognized. There are no known
30130 // X86 targets with 512-bit ADDSUB instructions!
30131 if (VT.is512BitVector())
30134 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
30137 /// \brief Try to combine a shuffle into a target-specific
30138 /// mul-sub-add node.
// NOTE(review): numbering is non-contiguous — the `SDValue Opnd2;` declaration
// and the trailing `return SDValue();`/closing brace are elided in this listing.
30139 static SDValue combineShuffleToFMSubAdd(SDNode *N,
30140 const X86Subtarget &Subtarget,
30141 SelectionDAG &DAG) {
30142 SDValue Opnd0, Opnd1;
// `true` selects the SUBADD (FADD-first) flavor of the matcher.
30143 if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true))
30146 EVT VT = N->getValueType(0);
30149 // Try to generate X86ISD::FMSUBADD node here.
// Unlike the ADDSUB case there is no non-fused fallback emitted here.
30151 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
30152 return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2);
30157 // We are looking for a shuffle where both sources are concatenated with undef
30158 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
30159 // if we can express this as a single-source shuffle, that's preferable.
// NOTE(review): numbering is non-contiguous — the early `return SDValue();`
// lines after each bail-out check are elided in this listing.
30160 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
30161 const X86Subtarget &Subtarget) {
30162 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
30165 EVT VT = N->getValueType(0);
30167 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
30168 if (!VT.is128BitVector() && !VT.is256BitVector())
30171 if (VT.getVectorElementType() != MVT::i32 &&
30172 VT.getVectorElementType() != MVT::i64 &&
30173 VT.getVectorElementType() != MVT::f32 &&
30174 VT.getVectorElementType() != MVT::f64)
30177 SDValue N0 = N->getOperand(0);
30178 SDValue N1 = N->getOperand(1);
30180 // Check that both sources are concats with undef.
30181 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
30182 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
30183 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
30184 !N1.getOperand(1).isUndef())
30187 // Construct the new shuffle mask. Elements from the first source retain their
30188 // index, but elements from the second source no longer need to skip an undef.
30189 SmallVector<int, 8> Mask;
30190 int NumElts = VT.getVectorNumElements();
30192 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
30193 for (int Elt : SVOp->getMask())
// Second-source indices shift down by the (half-width) undef gap.
30194 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
// Build concat(t1, t2) and shuffle it against undef — a single-source shuffle.
30197 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
30199 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
30202 /// Eliminate a redundant shuffle of a horizontal math op.
// NOTE(review): numbering is non-contiguous — `return SDValue();` lines and the
// final `return HOp;`/closing brace are elided in this listing.
30203 static SDValue foldShuffleOfHorizOp(SDNode *N) {
30204 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
30207 SDValue HOp = N->getOperand(0);
// Only the four horizontal add/sub opcodes (integer and FP) qualify.
30208 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
30209 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
30212 // 128-bit horizontal math instructions are defined to operate on adjacent
30213 // lanes of each operand as:
30214 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
30215 // ...similarly for v2f64 and v8i16.
30216 // TODO: 256-bit is not the same because...x86.
30217 if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
30220 // When the operands of a horizontal math op are identical, the low half of
30221 // the result is the same as the high half. If the shuffle is also replicating
30222 // low and high halves, we don't need the shuffle.
30223 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
30224 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
30225 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
30226 // but this should be tied to whatever horizontal op matching and shuffle
30227 // canonicalization are producing.
// Masks below are the "broadcast low half" patterns for 2/4/8 elements.
30228 if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
30229 isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
30230 isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
// Top-level DAG-combine entry point for shuffle nodes: tries ADDSUB/FMADDSUB/
// FMSUBADD formation, horizontal-op folds, bitcast-through-BINOP narrowing,
// consecutive-load merging, concat-undef rewrites and recursive target-shuffle
// combining.
// NOTE(review): numbering is non-contiguous — `SDLoc dl(N);` declarations,
// several `return` statements and closing braces are elided in this listing.
30236 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
30237 TargetLowering::DAGCombinerInfo &DCI,
30238 const X86Subtarget &Subtarget) {
30240 EVT VT = N->getValueType(0);
30241 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30242 // If we have legalized the vector types, look for blends of FADD and FSUB
30243 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
30244 if (TLI.isTypeLegal(VT)) {
30245 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
30248 if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
30251 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
30255 // During Type Legalization, when promoting illegal vector types,
30256 // the backend might introduce new shuffle dag nodes and bitcasts.
30258 // This code performs the following transformation:
30259 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
30260 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
30262 // We do this only if both the bitcast and the BINOP dag nodes have
30263 // one use. Also, perform this transformation only if the new binary
30264 // operation is legal. This is to avoid introducing dag nodes that
30265 // potentially need to be further expanded (or custom lowered) into a
30266 // less optimal sequence of dag nodes.
30267 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
30268 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
30269 N->getOperand(0).getOpcode() == ISD::BITCAST &&
30270 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
30271 SDValue N0 = N->getOperand(0);
30272 SDValue N1 = N->getOperand(1);
30274 SDValue BC0 = N0.getOperand(0);
30275 EVT SVT = BC0.getValueType();
30276 unsigned Opcode = BC0.getOpcode();
30277 unsigned NumElts = VT.getVectorNumElements();
// The bitcast must exactly double the element count (e.g. v2i64 -> v4i32).
30279 if (BC0.hasOneUse() && SVT.isVector() &&
30280 SVT.getVectorNumElements() * 2 == NumElts &&
30281 TLI.isOperationLegal(Opcode, VT)) {
30282 bool CanFold = false;
// NOTE(review): the `switch (Opcode)` dispatch lines are elided here; only
// the integer/FP legality guards of its cases remain visible.
30288 // isOperationLegal lies for integer ops on floating point types.
30289 CanFold = VT.isInteger();
30294 // isOperationLegal lies for floating point ops on integer types.
30295 CanFold = VT.isFloatingPoint();
30299 unsigned SVTNumElts = SVT.getVectorNumElements();
30300 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
// Require the mask to select exactly the even elements (i*2) in the low
// half and undef in the high half.
30301 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
30302 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
30303 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
30304 CanFold = SVOp->getMaskElt(i) < 0;
30307 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
30308 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
30309 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
30310 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
30315 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
30316 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
30317 // consecutive, non-overlapping, and in the right order.
30318 SmallVector<SDValue, 16> Elts;
30319 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
30320 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
30321 Elts.push_back(Elt);
// Only attempt the load merge when every element was resolvable.
30328 if (Elts.size() == VT.getVectorNumElements())
30330 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
30333 // For AVX2, we sometimes want to combine
30334 // (vector_shuffle <mask> (concat_vectors t1, undef)
30335 // (concat_vectors t2, undef))
30337 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
30338 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
30339 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
30342 if (isTargetShuffle(N->getOpcode())) {
30344 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
30347 // Try recursively combining arbitrary sequences of x86 shuffle
30348 // instructions into higher-order shuffles. We do this after combining
30349 // specific PSHUF instruction sequences into their minimal form so that we
30350 // can evaluate how many specialized shuffle instructions are involved in
30351 // a particular chain.
30352 if (SDValue Res = combineX86ShufflesRecursively(
30353 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
30354 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
30355 DCI.CombineTo(N, Res);
30363 /// Check if a vector extract from a target-specific shuffle of a load can be
30364 /// folded into a single element load.
30365 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
30366 /// shuffles have been custom lowered so we need to handle those here.
// NOTE(review): numbering is non-contiguous — `return SDValue();` bail-outs,
// a `bool UnaryShuffle;` declaration and an `SDLoc dl(N);` appear elided.
30367 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
30368 TargetLowering::DAGCombinerInfo &DCI) {
30369 if (DCI.isBeforeLegalizeOps())
30372 SDValue InVec = N->getOperand(0);
30373 SDValue EltNo = N->getOperand(1);
30374 EVT EltVT = N->getValueType(0);
// Only constant extract indices can be folded.
30376 if (!isa<ConstantSDNode>(EltNo))
30379 EVT OriginalVT = InVec.getValueType();
30381 // Peek through bitcasts, don't duplicate a load with other uses.
30382 InVec = peekThroughOneUseBitcasts(InVec);
30384 EVT CurrentVT = InVec.getValueType();
// The bitcast must preserve the element count for the mask index to stay valid.
30385 if (!CurrentVT.isVector() ||
30386 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
30389 if (!isTargetShuffle(InVec.getOpcode()))
30392 // Don't duplicate a load with other uses.
30393 if (!InVec.hasOneUse())
30396 SmallVector<int, 16> ShuffleMask;
30397 SmallVector<SDValue, 2> ShuffleOps;
30399 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
30400 ShuffleOps, ShuffleMask, UnaryShuffle))
30403 // Select the input vector, guarding against out of range extract vector.
30404 unsigned NumElems = CurrentVT.getVectorNumElements();
30405 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
30406 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
// Zero/undef mask sentinels short-circuit to a constant or undef scalar.
30408 if (Idx == SM_SentinelZero)
30409 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
30410 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
30411 if (Idx == SM_SentinelUndef)
30412 return DAG.getUNDEF(EltVT);
30414 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
// Idx >= NumElems selects from the second shuffle operand (elided else-arm).
30415 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
30418 // If inputs to shuffle are the same for both ops, then allow 2 uses
30419 unsigned AllowedUses =
30420 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
30422 if (LdNode.getOpcode() == ISD::BITCAST) {
30423 // Don't duplicate a load with other uses.
30424 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
30427 AllowedUses = 1; // only allow 1 load use if we have a bitcast
30428 LdNode = LdNode.getOperand(0);
30431 if (!ISD::isNormalLoad(LdNode.getNode()))
30434 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
// Volatile loads must not be split or duplicated.
30436 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
30439 // If there's a bitcast before the shuffle, check if the load type and
30440 // alignment is valid.
30441 unsigned Align = LN0->getAlignment();
30442 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30443 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
30444 EltVT.getTypeForEVT(*DAG.getContext()));
30446 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
30449 // All checks match so transform back to vector_shuffle so that DAG combiner
30450 // can finish the job
30453 // Create shuffle node taking into account the case that its a unary shuffle
30454 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
30455 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
30457 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
30458 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
30462 // Try to match patterns such as
30463 // (i16 bitcast (v16i1 x))
30465 // (i16 movmsk (16i8 sext (v16i1 x)))
30466 // before the illegal vector is scalarized on subtargets that don't have legal
// NOTE(review): numbering is non-contiguous — the `MVT SExtVT;` declaration,
// several `return SDValue();`/`break;` lines, case labels and an `SDLoc DL(...)`
// are elided from this listing.
30468 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
30469 const X86Subtarget &Subtarget) {
30470 EVT VT = BitCast.getValueType();
30471 SDValue N0 = BitCast.getOperand(0);
30472 EVT VecVT = N0->getValueType(0);
30474 if (!VT.isScalarInteger() || !VecVT.isSimple())
30477 // With AVX512 vxi1 types are legal and we prefer using k-regs.
30478 // MOVMSK is supported in SSE2 or later.
30479 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
30482 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
30483 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
30484 // v8i16 and v16i16.
30485 // For these two cases, we can shuffle the upper element bytes to a
30486 // consecutive sequence at the start of the vector and treat the results as
30487 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
30488 // for v16i16 this is not the case, because the shuffle is expensive, so we
30489 // avoid sign-extending to this type entirely.
30490 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
30491 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
// FPCastVT selects the FP view used so MOVMSKPS/PD can be used where available.
30493 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
30494 switch (VecVT.getSimpleVT().SimpleTy) {
// (elided case label — presumably v2i1)
30498 SExtVT = MVT::v2i64;
30499 FPCastVT = MVT::v2f64;
// (elided case label — presumably v4i1)
30502 SExtVT = MVT::v4i32;
30503 FPCastVT = MVT::v4f32;
30504 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
30505 // sign-extend to a 256-bit operation to avoid truncation.
30506 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30507 N0->getOperand(0).getValueType().is256BitVector()) {
30508 SExtVT = MVT::v4i64;
30509 FPCastVT = MVT::v4f64;
// (elided case label — presumably v8i1)
30513 SExtVT = MVT::v8i16;
30514 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
30515 // sign-extend to a 256-bit operation to match the compare.
30516 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
30517 // 256-bit because the shuffle is cheaper than sign extending the result of
30519 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30520 (N0->getOperand(0).getValueType().is256BitVector() ||
30521 N0->getOperand(0).getValueType().is512BitVector())) {
30522 SExtVT = MVT::v8i32;
30523 FPCastVT = MVT::v8f32;
// (elided case label — presumably v16i1)
30527 SExtVT = MVT::v16i8;
30528 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
30529 // it is not profitable to sign-extend to 256-bit because this will
30530 // require an extra cross-lane shuffle which is more expensive than
30531 // truncating the result of the compare to 128-bits.
// (elided case label — presumably v32i1)
30534 SExtVT = MVT::v32i8;
// Sign-extend the i1 vector so each lane is all-ones/all-zero for MOVMSK.
30539 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
30541 if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30542 // Handle pre-AVX2 cases by splitting to two v16i1's.
30543 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30544 MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
30545 SDValue Lo = extract128BitVector(V, 0, DAG, DL);
30546 SDValue Hi = extract128BitVector(V, 16, DAG, DL);
30547 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30548 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
// Combine the two 16-bit masks: low half | (high half << 16).
30549 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30550 DAG.getConstant(16, DL, ShiftTy));
30551 V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30552 return DAG.getZExtOrTrunc(V, DL, VT);
30555 if (SExtVT == MVT::v8i16) {
// PACKSS narrows v8i16 -> v16i8 losslessly because each lane is all-sign-bits.
30556 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
30557 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
30558 DAG.getUNDEF(MVT::v8i16));
30560 assert(SExtVT.getScalarType() != MVT::i16 &&
30561 "Vectors of i16 must be packed");
30562 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
30563 V = DAG.getBitcast(FPCastVT, V);
30564 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30565 return DAG.getZExtOrTrunc(V, DL, VT);
// DAG-combine entry point for BITCAST nodes: vXi1->int MOVMSK lowering,
// v4i1/v2i1 widening on VLX, MMX movement idioms, and turning integer logic on
// bitcasted FP values into FP logic ops.
// NOTE(review): numbering is non-contiguous — several `SDLoc` declarations,
// `unsigned FPOpcode;` and some `return` lines are elided from this listing.
30568 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
30569 TargetLowering::DAGCombinerInfo &DCI,
30570 const X86Subtarget &Subtarget) {
30571 SDValue N0 = N->getOperand(0);
30572 EVT VT = N->getValueType(0);
30573 EVT SrcVT = N0.getValueType();
30575 // Try to match patterns such as
30576 // (i16 bitcast (v16i1 x))
30578 // (i16 movmsk (16i8 sext (v16i1 x)))
30579 // before the setcc result is scalarized on subtargets that don't have legal
30581 if (DCI.isBeforeLegalize()) {
30582 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
30585 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
30586 // type, widen both sides to avoid a trip through memory.
30587 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
30588 Subtarget.hasVLX()) {
// Widen scalar -> i8, view as v8i1, then extract the low v4i1/v2i1 subvector.
30590 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
30591 N0 = DAG.getBitcast(MVT::v8i1, N0);
30592 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
30593 DAG.getIntPtrConstant(0, dl));
30596 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
30597 // type, widen both sides to avoid a trip through memory.
30598 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
30599 Subtarget.hasVLX()) {
// Reverse direction: pad with undef up to v8i1, view as i8, truncate to VT.
30601 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
30602 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
30604 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
30605 N0 = DAG.getBitcast(MVT::i8, N0);
30606 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
30610 // Since MMX types are special and don't usually play with other vector types,
30611 // it's better to handle them early to be sure we emit efficient code by
30612 // avoiding store-load conversions.
30614 // Detect bitcasts between i32 to x86mmx low word.
30615 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
30616 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
30617 SDValue N00 = N0->getOperand(0);
30618 if (N00.getValueType() == MVT::i32)
30619 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
30622 // Detect bitcasts between element or subvector extraction to x86mmx.
30623 if (VT == MVT::x86mmx &&
30624 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
30625 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
30626 isNullConstant(N0.getOperand(1))) {
30627 SDValue N00 = N0->getOperand(0);
30628 if (N00.getValueType().is128BitVector())
30629 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
30630 DAG.getBitcast(MVT::v2i64, N00));
30633 // Detect bitcasts from FP_TO_SINT to x86mmx.
30634 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
30635 N0.getOpcode() == ISD::FP_TO_SINT) {
30637 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
30638 DAG.getUNDEF(MVT::v2i32));
30639 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
30640 DAG.getBitcast(MVT::v2i64, Res));
30643 // Convert a bitcasted integer logic operation that has one bitcasted
30644 // floating-point operand into a floating-point logic operation. This may
30645 // create a load of a constant, but that is cheaper than materializing the
30646 // constant in an integer register and transferring it to an SSE register or
30647 // transferring the SSE operand to integer register and back.
30649 switch (N0.getOpcode()) {
30650 case ISD::AND: FPOpcode = X86ISD::FAND; break;
30651 case ISD::OR: FPOpcode = X86ISD::FOR; break;
30652 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
30653 default: return SDValue();
// FAND/FOR/FXOR need SSE1 for f32 and SSE2 for f64.
30656 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
30657 (Subtarget.hasSSE2() && VT == MVT::f64)))
30660 SDValue LogicOp0 = N0.getOperand(0);
30661 SDValue LogicOp1 = N0.getOperand(1);
30664 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
30665 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
30666 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
30667 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
30668 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
30669 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
30671 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
30672 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
30673 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
30674 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
30675 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
30676 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
30682 // Match a binop + shuffle pyramid that represents a horizontal reduction over
30683 // the elements of a vector.
30684 // Returns the vector that is being reduced on, or SDValue() if a reduction
30685 // was not matched.
// NOTE(review): numbering is non-contiguous — `return SDValue();` failure paths,
// an else-branch brace and the final `return Op;` appear elided from this listing.
30686 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
30687 ArrayRef<ISD::NodeType> CandidateBinOps) {
30688 // The pattern must end in an extract from index 0.
30689 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
30690 !isNullConstant(Extract->getOperand(1)))
30693 SDValue Op = Extract->getOperand(0);
// A reduction over N lanes needs log2(N) binop+shuffle stages.
30694 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
30696 // Match against one of the candidate binary ops.
30697 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
30698 return Op.getOpcode() == unsigned(BinOp);
30702 // At each stage, we're looking for something that looks like:
30703 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
30704 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
30705 // i32 undef, i32 undef, i32 undef, i32 undef>
30706 // %a = binop <8 x i32> %op, %s
30707 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
30708 // we expect something like:
30709 // <4,5,6,7,u,u,u,u>
30710 // <2,3,u,u,u,u,u,u>
30711 // <1,u,u,u,u,u,u,u>
30712 unsigned CandidateBinOp = Op.getOpcode();
30713 for (unsigned i = 0; i < Stages; ++i) {
// Every stage of the pyramid must use the same binop.
30714 if (Op.getOpcode() != CandidateBinOp)
30717 ShuffleVectorSDNode *Shuffle =
30718 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
// The shuffle may be either operand of the binop; descend into the other.
30720 Op = Op.getOperand(1);
30722 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
30723 Op = Op.getOperand(0);
30726 // The first operand of the shuffle should be the same as the other operand
30728 if (!Shuffle || Shuffle->getOperand(0) != Op)
30731 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
30732 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
30733 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
// Success: report which binop formed the reduction.
30737 BinOp = CandidateBinOp;
30741 // Given a select, detect the following pattern:
30742 // 1: %2 = zext <N x i8> %0 to <N x i32>
30743 // 2: %3 = zext <N x i8> %1 to <N x i32>
30744 // 3: %4 = sub nsw <N x i32> %2, %3
30745 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30746 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
30747 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30748 // This is useful as it is the input into a SAD pattern.
// NOTE(review): numbering is non-contiguous — `return false;` lines and the
// final `return true;`/closing brace are elided from this listing.
30749 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
30751 // Check the condition of the select instruction is greater-than.
30752 SDValue SetCC = Select->getOperand(0);
30753 if (SetCC.getOpcode() != ISD::SETCC)
30755 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30756 if (CC != ISD::SETGT && CC != ISD::SETLT)
30759 SDValue SelectOp1 = Select->getOperand(1);
30760 SDValue SelectOp2 = Select->getOperand(2);
30762 // The following instructions assume SelectOp1 is the subtraction operand
30763 // and SelectOp2 is the negation operand.
30764 // In the case of SETLT this is the other way around.
30765 if (CC == ISD::SETLT)
30766 std::swap(SelectOp1, SelectOp2);
30768 // The second operand of the select should be the negation of the first
30769 // operand, which is implemented as 0 - SelectOp1.
30770 if (!(SelectOp2.getOpcode() == ISD::SUB &&
30771 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
30772 SelectOp2.getOperand(1) == SelectOp1))
30775 // The first operand of SetCC is the first operand of the select, which is the
30776 // difference between the two input vectors.
30777 if (SetCC.getOperand(0) != SelectOp1)
30780 // In SetLT case, The second operand of the comparison can be either 1 or 0.
// (the `APInt SplatVal;` declaration is elided from this listing)
30782 if ((CC == ISD::SETLT) &&
30783 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
30784 SplatVal.isOneValue()) ||
30785 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
30788 // In SetGT case, The second operand of the comparison can be either -1 or 0.
30789 if ((CC == ISD::SETGT) &&
30790 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30791 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30794 // The first operand of the select is the difference between the two input
30796 if (SelectOp1.getOpcode() != ISD::SUB)
// Report the two pre-extension operands of the absolute difference.
30799 Op0 = SelectOp1.getOperand(0);
30800 Op1 = SelectOp1.getOperand(1);
30802 // Check if the operands of the sub are zero-extended from vectors of i8.
30803 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30804 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30805 Op1.getOpcode() != ISD::ZERO_EXTEND ||
30806 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30812 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// Returns a v(RegSize/64)i64 X86ISD::PSADBW node over the (padded) i8 inputs.
30814 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
30815 const SDValue &Zext1, const SDLoc &DL) {
30817 // Find the appropriate width for the PSADBW.
30818 EVT InVT = Zext0.getOperand(0).getValueType();
// PSADBW operates on at least a full 128-bit register.
30819 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
30821 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
30822 // fill in the missing vector elements with 0.
30823 unsigned NumConcat = RegSize / InVT.getSizeInBits();
30824 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30825 Ops[0] = Zext0.getOperand(0);
30826 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30827 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
// Reuse the zero-padded Ops vector for the second input.
30828 Ops[0] = Zext1.getOperand(0);
30829 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30831 // Actually build the SAD
30832 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
30833 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
30836 // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
// ... a PHMINPOSUW-based sequence (continuation of this comment is elided).
// NOTE(review): numbering is non-contiguous — `return SDValue();` bail-outs,
// an `SDLoc DL(Extract);`, `unsigned BinOp;` and `SDValue Mask;` declarations
// appear elided from this listing.
30838 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
30839 const X86Subtarget &Subtarget) {
30840 // Bail without SSE41.
30841 if (!Subtarget.hasSSE41())
30844 EVT ExtractVT = Extract->getValueType(0);
30845 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
30848 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
30850 SDValue Src = matchBinOpReduction(
30851 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
30855 EVT SrcVT = Src.getValueType();
30856 EVT SrcSVT = SrcVT.getScalarType();
// Element type must match the extract and the vector must be a whole number
// of 128-bit lanes.
30857 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
30861 SDValue MinPos = Src;
30863 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
30864 while (SrcVT.getSizeInBits() > 128) {
30865 unsigned NumElts = SrcVT.getVectorNumElements();
30866 unsigned NumSubElts = NumElts / 2;
30867 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
30868 unsigned SubSizeInBits = SrcVT.getSizeInBits();
30869 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
30870 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
30871 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
30873 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
30874 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
30875 "Unexpected value type");
30877 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
30878 // to flip the value accordingly.
30880 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
// XOR constants map each op onto UMIN: sign-flip for SMAX/SMIN, bit-flip
// (all-ones) for UMAX; UMIN needs no mask (Mask left null).
30881 if (BinOp == ISD::SMAX)
30882 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
30883 else if (BinOp == ISD::SMIN)
30884 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
30885 else if (BinOp == ISD::UMAX)
30886 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
30889 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
30891 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
30892 // shuffling each upper element down and insert zeros. This means that the
30893 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
30894 // ready for the PHMINPOS.
30895 if (ExtractVT == MVT::i8) {
30896 SDValue Upper = DAG.getVectorShuffle(
30897 SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
30898 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
30899 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
30902 // Perform the PHMINPOS on a v8i16 vector,
30903 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
30904 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
30905 MinPos = DAG.getBitcast(SrcVT, MinPos);
// Undo the pre-PHMINPOS flip, then extract the scalar result from lane 0.
30908 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
30910 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
30911 DAG.getIntPtrConstant(0, DL));
30914 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
// Rewrites an OR(any_of) / AND(all_of) reduction tree rooted at an
// EXTRACT_VECTOR_ELT into a single MOVMSK of the sign bits plus one scalar
// compare+select, which is far cheaper than the shuffle/op pyramid.
// NOTE(review): this chunk is a lossy extraction -- the `return SDValue();`
// after each bail-out condition (and some braces) are elided from view;
// comments below describe only the visible logic.
30915 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
30917                                                 const X86Subtarget &Subtarget) {
30918 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
30919 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
// Only scalar integer extract types that MOVMSK results can be truncated /
// extended to are handled.
30922 EVT ExtractVT = Extract->getValueType(0);
30923 unsigned BitWidth = ExtractVT.getSizeInBits();
30924 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
30925 ExtractVT != MVT::i8)
30928 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
30929 unsigned BinOp = 0;
30930 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
30934 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
30935 // which we can't support here for now.
30936 if (Match.getScalarValueSizeInBits() != BitWidth)
30939 // We require AVX2 for PMOVMSKB for v16i16/v32i8;
// 256-bit float MOVMSKPS/PD only needs AVX, hence the BitWidth >= 32 carve-out.
30940 unsigned MatchSizeInBits = Match.getValueSizeInBits();
30941 if (!(MatchSizeInBits == 128 ||
30942 (MatchSizeInBits == 256 &&
30943 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
30946 // Don't bother performing this for 2-element vectors.
30947 if (Match.getValueType().getVectorNumElements() <= 2)
30950 // Check that we are extracting a reduction of all sign bits.
// MOVMSK only reads the per-element sign bit, so every element must be an
// all-ones/all-zeros "bool" splat for the transform to be sound.
30951 if (DAG.ComputeNumSignBits(Match) != BitWidth)
30954 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
30956 if (64 == BitWidth || 32 == BitWidth)
30957 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
30958 MatchSizeInBits / BitWidth)
30960 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
// Pick the scalar comparison: any_of is satisfied by any set mask bit,
// all_of requires the full low-NumElts mask.
30963 ISD::CondCode CondCode;
30964 if (BinOp == ISD::OR) {
30965 // any_of -> MOVMSK != 0
30966 CompareBits = APInt::getNullValue(32);
30967 CondCode = ISD::CondCode::SETNE;
30969 // all_of -> MOVMSK == ((1 << NumElts) - 1)
30970 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
30971 CondCode = ISD::CondCode::SETEQ;
30974 // Perform the select as i32/i64 and then truncate to avoid partial register
// stalls (the ResWidth clamp keeps i8/i16 results computed in 32-bit regs).
30976 unsigned ResWidth = std::max(BitWidth, 32u);
30977 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
30979 SDValue Zero = DAG.getConstant(0, DL, ResVT);
30980 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
30981 SDValue Res = DAG.getBitcast(MaskVT, Match);
30982 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
30983 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
30984 Ones, Zero, CondCode);
// Result is all-ones or all-zeros; sext/trunc reproduces the bool splat at
// the extract's scalar type.
30985 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
// Attempt to rewrite an extract of a horizontal ADD reduction over an
// absolute-difference pattern into PSADBW (sum of absolute differences),
// followed by a log2 shuffle+add pyramid to fold the per-lane SAD results.
// NOTE(review): lossy extraction -- early `return SDValue();` lines and some
// statements (e.g. the RegSize assignments, the Shuffle declaration) are
// elided from view; comments describe the visible logic only.
30988 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
30989 const X86Subtarget &Subtarget) {
30990 // PSADBW is only supported on SSE2 and up.
30991 if (!Subtarget.hasSSE2())
30994 // Verify the type we're extracting from is any integer type above i16.
30995 EVT VT = Extract->getOperand(0).getValueType();
30996 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
// Widest vector register available: 128 (SSE2) / 256 (AVX2) / 512 (BWI).
30999 unsigned RegSize = 128;
31000 if (Subtarget.hasBWI())
31002 else if (Subtarget.hasAVX2())
31005 // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
31006 // TODO: We should be able to handle larger vectors by splitting them before
31007 // feeding them into several SADs, and then reducing over those.
31008 if (RegSize / VT.getVectorNumElements() < 8)
31011 // Match shuffle + add pyramid.
31012 unsigned BinOp = 0;
31013 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
31015 // The operand is expected to be zero extended from i8
31016 // (verified in detectZextAbsDiff).
31017 // In order to convert to i64 and above, additional any/zero/sign
31018 // extend is expected.
31019 // The zero extend from 32 bit has no mathematical effect on the result.
31020 // Also the sign extend is basically zero extend
31021 // (extends the sign bit which is zero).
31022 // So it is correct to skip the sign/zero extend instruction.
31023 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
31024 Root.getOpcode() == ISD::ZERO_EXTEND ||
31025 Root.getOpcode() == ISD::ANY_EXTEND))
31026 Root = Root.getOperand(0);
31028 // If there was a match, we want Root to be a select that is the root of an
31029 // abs-diff pattern.
31030 if (!Root || (Root.getOpcode() != ISD::VSELECT))
31033 // Check whether we have an abs-diff pattern feeding into the select.
31034 SDValue Zext0, Zext1;
31035 if (!detectZextAbsDiff(Root, Zext0, Zext1))
31038 // Create the SAD instruction.
31040 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
31042 // If the original vector was wider than 8 elements, sum over the results
31043 // in the SAD vector.
31044 unsigned Stages = Log2_32(VT.getVectorNumElements());
31045 MVT SadVT = SAD.getSimpleValueType();
31047 unsigned SadElems = SadVT.getVectorNumElements();
// Halving pyramid: each iteration shuffles the upper half down and adds it
// to the lower half, so the final sum accumulates in element 0.
31049 for(unsigned i = Stages - 3; i > 0; --i) {
31050 SmallVector<int, 16> Mask(SadElems, -1);
31051 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
31052 Mask[j] = MaskEnd + j;
31055 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
31056 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
31060 MVT Type = Extract->getSimpleValueType(0);
31061 unsigned TypeSizeInBits = Type.getSizeInBits();
31062 // Return the lowest TypeSizeInBits bits.
// Bitcast to a vector of the extract's scalar type so element 0 is exactly
// the low TypeSizeInBits bits of the SAD accumulator.
31063 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
31064 SAD = DAG.getBitcast(ResVT, SAD);
31065 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
31066 Extract->getOperand(1));
31069 // Attempt to peek through a target shuffle and extract the scalar from the
// source operand directly, avoiding materialization of the shuffled vector.
// NOTE(review): lossy extraction -- early `return SDValue();` lines and the
// SDLoc declaration are elided from view.
31071 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
31072 TargetLowering::DAGCombinerInfo &DCI,
31073 const X86Subtarget &Subtarget) {
31074 if (DCI.isBeforeLegalizeOps())
31077 SDValue Src = N->getOperand(0);
31078 SDValue Idx = N->getOperand(1);
31080 EVT VT = N->getValueType(0);
31081 EVT SrcVT = Src.getValueType();
31082 EVT SrcSVT = SrcVT.getVectorElementType();
31083 unsigned NumSrcElts = SrcVT.getVectorNumElements();
31085 // Don't attempt this for boolean mask vectors or unknown extraction indices.
31086 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
31089 // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
31090 if (X86ISD::VBROADCAST == Src.getOpcode() &&
31091 Src.getOperand(0).getValueType() == VT)
31092 return Src.getOperand(0);
31094 // Resolve the target shuffle inputs and mask.
31095 SmallVector<int, 16> Mask;
31096 SmallVector<SDValue, 2> Ops;
31097 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
31100 // Attempt to narrow/widen the shuffle mask to the correct size.
// The resolved mask may be at a different granularity than SrcVT (the
// shuffle could have been seen through a bitcast); rescale so one mask
// element corresponds to one source element.
31101 if (Mask.size() != NumSrcElts) {
31102 if ((NumSrcElts % Mask.size()) == 0) {
31103 SmallVector<int, 16> ScaledMask;
31104 int Scale = NumSrcElts / Mask.size();
31105 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
31106 Mask = std::move(ScaledMask);
31107 } else if ((Mask.size() % NumSrcElts) == 0) {
31108 SmallVector<int, 16> WidenedMask;
31109 while (Mask.size() > NumSrcElts &&
31110 canWidenShuffleElements(Mask, WidenedMask))
31111 Mask = std::move(WidenedMask);
31112 // TODO - investigate support for wider shuffle masks with known upper
31113 // undef/zero elements for implicit zero-extension.
31117 // Check if narrowing/widening failed.
31118 if (Mask.size() != NumSrcElts)
// Map the extract index through the shuffle mask to find which source
// element (and which source operand) actually feeds the result.
31121 int SrcIdx = Mask[N->getConstantOperandVal(1)];
31124 // If the shuffle source element is undef/zero then we can just accept it.
31125 if (SrcIdx == SM_SentinelUndef)
31126 return DAG.getUNDEF(VT);
31128 if (SrcIdx == SM_SentinelZero)
31129 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
31130 : DAG.getConstant(0, dl, VT);
// SrcIdx / Mask.size() selects which of the (up to 2) shuffle operands the
// element comes from; the remainder is the lane within that operand.
31132 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
31133 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
31134 SrcIdx = SrcIdx % Mask.size();
31136 // We can only extract other elements from 128-bit vectors and in certain
31137 // circumstances, depending on SSE-level.
31138 // TODO: Investigate using extract_subvector for larger vectors.
31139 // TODO: Investigate float/double extraction if it will be just stored.
31140 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
31141 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
31142 assert(SrcSVT == VT && "Unexpected extraction type");
31143 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
31144 DAG.getIntPtrConstant(SrcIdx, dl));
// PEXTRW/PEXTRB produce a zero-extended i32; trunc/zext back to VT.
31147 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
31148 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
31149 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
31150 "Unexpected extraction type");
31151 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
31152 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
31153 DAG.getIntPtrConstant(SrcIdx, dl));
31154 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
31160 /// Detect vector gather/scatter index generation and convert it from being a
31161 /// bunch of shuffles and extracts into a somewhat faster sequence.
31162 /// For i686, the best sequence is apparently storing the value and loading
31163 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
// Main EXTRACT_VECTOR_ELT combine. Tries, in order: shuffle peek-through,
// load-folding, MMX bitcast patterns, constant folding, SAD / predicate /
// min-max horizontal-reduction rewrites, and finally the v4i32
// "extract-all-four-elements" transform documented above.
// NOTE(review): lossy extraction -- several statements (early returns, the
// MaskVT/Vals declarations, store/load plumbing) are elided from view.
31164 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
31165 TargetLowering::DAGCombinerInfo &DCI,
31166 const X86Subtarget &Subtarget) {
31167 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
31170 // TODO - Remove this once we can handle the implicit zero-extension of
31171 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
31172 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
31173 // combineBasicSADPattern.
31174 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
31177 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
31180 SDValue InputVector = N->getOperand(0);
31181 SDValue EltIdx = N->getOperand(1);
31183 EVT SrcVT = InputVector.getValueType();
31184 EVT VT = N->getValueType(0);
31185 SDLoc dl(InputVector);
31187 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
31188 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
31189 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
31190 SDValue MMXSrc = InputVector.getOperand(0);
31192 // The bitcast source is a direct mmx result.
31193 if (MMXSrc.getValueType() == MVT::x86mmx)
31194 return DAG.getBitcast(VT, InputVector);
31197 // Detect mmx to i32 conversion through a v2i32 elt extract.
31198 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
31199 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
31200 SDValue MMXSrc = InputVector.getOperand(0);
31202 // The bitcast source is a direct mmx result.
31203 if (MMXSrc.getValueType() == MVT::x86mmx)
31204 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
// Constant-fold extraction of a single bit from a bitcast integer constant.
31207 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
31208 isa<ConstantSDNode>(EltIdx) &&
31209 isa<ConstantSDNode>(InputVector.getOperand(0))) {
31210 uint64_t ExtractedElt = N->getConstantOperandVal(1);
31211 uint64_t InputValue = InputVector.getConstantOperandVal(0);
31212 uint64_t Res = (InputValue >> ExtractedElt) & 1;
31213 return DAG.getConstant(Res, dl, MVT::i1);
31216 // Check whether this extract is the root of a sum of absolute differences
31217 // pattern. This has to be done here because we really want it to happen
31218 // pre-legalization,
31219 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
31222 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
31223 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
31226 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
31227 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
31230 // Only operate on vectors of 4 elements, where the alternative shuffling
31231 // gets to be more expensive.
31232 if (SrcVT != MVT::v4i32)
31235 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
31236 // single use which is a sign-extend or zero-extend, and all elements are
// used (otherwise the transform is unlikely to pay off).
31238 SmallVector<SDNode *, 4> Uses;
31239 unsigned ExtractedElements = 0;
31240 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
31241 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
31242 if (UI.getUse().getResNo() != InputVector.getResNo())
31245 SDNode *Extract = *UI;
31246 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
31249 if (Extract->getValueType(0) != MVT::i32)
31251 if (!Extract->hasOneUse())
31253 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
31254 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
31256 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
31259 // Record which element was extracted.
31260 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
31261 Uses.push_back(Extract);
31264 // If not all the elements were used, this may not be worthwhile.
// 15 == 0b1111: all four v4i32 lanes were individually extracted.
31265 if (ExtractedElements != 15)
31268 // Ok, we've now decided to do the transformation.
31269 // If 64-bit shifts are legal, use the extract-shift sequence,
31270 // otherwise bounce the vector off the cache.
31271 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31274 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
// x64 path: view the v4i32 as two i64 halves; each half yields two i32s
// (low bits directly, high bits via an arithmetic shift by 32).
31275 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
31276 auto &DL = DAG.getDataLayout();
31277 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
31278 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
31279 DAG.getConstant(0, dl, VecIdxTy));
31280 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
31281 DAG.getConstant(1, dl, VecIdxTy));
31283 SDValue ShAmt = DAG.getConstant(
31284 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
31285 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
31286 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
31287 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
31288 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
31289 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
31290 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
// i686 path: spill the vector to a stack slot and reload each scalar.
31292 // Store the value to a temporary stack slot.
31293 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
31294 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
31295 MachinePointerInfo());
31297 EVT ElementType = SrcVT.getVectorElementType();
31298 unsigned EltSize = ElementType.getSizeInBits() / 8;
31300 // Replace each use (extract) with a load of the appropriate element.
31301 for (unsigned i = 0; i < 4; ++i) {
31302 uint64_t Offset = EltSize * i;
31303 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
31304 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
31306 SDValue ScalarAddr =
31307 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
31309 // Load the scalar.
31311 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
31315 // Replace the extracts
31316 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
31317 UE = Uses.end(); UI != UE; ++UI) {
31318 SDNode *Extract = *UI;
31320 uint64_t IdxVal = Extract->getConstantOperandVal(1);
31321 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
31324 // The replacement was made in place; return N so it won't be revisited.
31325 return SDValue(N, 0);
31328 /// If a vector select has an operand that is -1 or 0, try to simplify the
31329 /// select to a bitwise logic operation.
31330 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
// NOTE(review): lossy extraction -- the `static SDValue` return-type line,
// several early returns, and some closing braces are elided from view.
31332 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
31333 TargetLowering::DAGCombinerInfo &DCI,
31334 const X86Subtarget &Subtarget) {
31335 SDValue Cond = N->getOperand(0);
31336 SDValue LHS = N->getOperand(1);
31337 SDValue RHS = N->getOperand(2);
31338 EVT VT = LHS.getValueType();
31339 EVT CondVT = Cond.getValueType();
31341 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31343 if (N->getOpcode() != ISD::VSELECT)
31346 assert(CondVT.isVector() && "Vector select expects a vector selector!");
31348 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
31349 // Check if the first operand is all zeros and Cond type is vXi1.
31350 // This situation only applies to avx512.
31351 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
31352 CondVT.getVectorElementType() == MVT::i1) {
31353 // Invert the cond to not(cond) : xor(op,allones)=not(op)
31354 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
31355 DAG.getAllOnesConstant(DL, CondVT));
31356 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
31357 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
31360 // To use the condition operand as a bitwise mask, it must have elements that
31361 // are the same size as the select elements. Ie, the condition operand must
31362 // have already been promoted from the IR select condition type <N x i1>.
31363 // Don't check if the types themselves are equal because that excludes
31364 // vector floating-point selects.
31365 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
31368 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
31369 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
31371 // Try to invert the condition if true value is not all 1s and false value is
// not all 0s -- inverting the setcc and swapping the arms may expose the
// cheaper TValIsAllOnes/FValIsAllZeros forms below.
31373 if (!TValIsAllOnes && !FValIsAllZeros &&
31374 // Check if the selector will be produced by CMPP*/PCMP*.
31375 Cond.getOpcode() == ISD::SETCC &&
31376 // Check if SETCC has already been promoted.
31377 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
31379 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
31381 if (TValIsAllZeros || FValIsAllOnes) {
31382 SDValue CC = Cond.getOperand(2);
31383 ISD::CondCode NewCC =
31384 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
31385 Cond.getOperand(0).getValueType().isInteger());
31386 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
31388 std::swap(LHS, RHS);
31389 TValIsAllOnes = FValIsAllOnes;
31390 FValIsAllZeros = TValIsAllZeros;
31394 // Cond value must be 'sign splat' to be converted to a logical op.
// Every condition element must be all-ones or all-zeros for AND/OR/ANDN to
// reproduce the per-element select semantics.
31395 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
31398 // vselect Cond, 111..., 000... -> Cond
31399 if (TValIsAllOnes && FValIsAllZeros)
31400 return DAG.getBitcast(VT, Cond);
31402 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
31405 // vselect Cond, 111..., X -> or Cond, X
31406 if (TValIsAllOnes) {
31407 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
31408 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
31409 return DAG.getBitcast(VT, Or);
31412 // vselect Cond, X, 000... -> and Cond, X
31413 if (FValIsAllZeros) {
31414 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
31415 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
31416 return DAG.getBitcast(VT, And);
31419 // vselect Cond, 000..., X -> andn Cond, X
31420 if (TValIsAllZeros) {
// ANDNP is an x86 node; bitcast to vNi64 to get a type it accepts.
31421 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
31422 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
31423 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
31424 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
31425 return DAG.getBitcast(VT, AndN);
// Lower select(i1 Cond, ConstTC, ConstFC) into branchless math:
//   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
// when |TC - FC| is a power of 2 (shift) or 3/5/9 on i32/i64 (LEA).
// NOTE(review): lossy extraction -- a few lines (e.g. the `bool OV;`
// declaration, the overflow bail-out, the final returns) are elided.
31431 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
31432 SDValue Cond = N->getOperand(0);
31433 SDValue LHS = N->getOperand(1);
31434 SDValue RHS = N->getOperand(2);
31437 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
31438 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
31439 if (!TrueC || !FalseC)
31442 // Don't do this for crazy integer types.
31443 EVT VT = N->getValueType(0);
31444 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31447 // We're going to use the condition bit in math or logic ops. We could allow
31448 // this with a wider condition value (post-legalization it becomes an i8),
31449 // but if nothing is creating selects that late, it doesn't matter.
31450 if (Cond.getValueType() != MVT::i1)
31453 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
31454 // 3, 5, or 9 with i32/i64, so those get transformed too.
31455 // TODO: For constants that overflow or do not differ by power-of-2 or small
31456 // multiplier, convert to 'and' + 'add'.
31457 const APInt &TrueVal = TrueC->getAPIntValue();
31458 const APInt &FalseVal = FalseC->getAPIntValue();
// ssub_ov flags signed overflow of TC - FC; the (elided) check bails out
// when the difference is not representable.
31460 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
31464 APInt AbsDiff = Diff.abs();
31465 if (AbsDiff.isPowerOf2() ||
31466 ((VT == MVT::i32 || VT == MVT::i64) &&
31467 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
31469 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
31470 // of the condition can usually be folded into a compare predicate, but even
31471 // without that, the sequence should be cheaper than a CMOV alternative.
31472 if (TrueVal.slt(FalseVal)) {
31473 Cond = DAG.getNOT(DL, Cond, MVT::i1);
31474 std::swap(TrueC, FalseC);
31477 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
31478 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
31480 // Multiply condition by the difference if non-one.
31481 if (!AbsDiff.isOneValue())
31482 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
31484 // Add the base if non-zero.
31485 if (!FalseC->isNullValue())
31486 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
31494 // If this is a bitcasted op that can be represented as another type, push the
31495 // the bitcast to the inputs. This allows more opportunities for pattern
31496 // matching masked instructions. This is called when we know that the operation
31497 // is used as one of the inputs of a vselect.
// Returns true when it performed the combine (via DCI.CombineTo); false to
// let the caller try other combines.
// NOTE(review): lossy extraction -- `return false;` lines, the switch's
// `default:` / closing braces, and the lambda's imm operand are elided.
31498 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
31499 TargetLowering::DAGCombinerInfo &DCI) {
31500 // Make sure we have a bitcast.
31501 if (OrigOp.getOpcode() != ISD::BITCAST)
31504 SDValue Op = OrigOp.getOperand(0);
31506 // If the operation is used by anything other than the bitcast, we shouldn't
31507 // do this combine as that would replicate the operation.
31508 if (!Op.hasOneUse())
31511 MVT VT = OrigOp.getSimpleValueType();
31512 MVT EltVT = VT.getVectorElementType();
31513 SDLoc DL(Op.getNode());
// Helper: bitcast both shuffle inputs to the mask-side type and rebuild the
// node there, so the vselect mask lines up with the shuffle's element size.
31515 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
31517 Op0 = DAG.getBitcast(VT, Op0);
31518 DCI.AddToWorklist(Op0.getNode());
31519 Op1 = DAG.getBitcast(VT, Op1);
31520 DCI.AddToWorklist(Op1.getNode());
31521 DCI.CombineTo(OrigOp.getNode(),
31522 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
31526 unsigned Opcode = Op.getOpcode();
31528 case X86ISD::SHUF128: {
// SHUF128 shuffles 128-bit lanes, so only the 32/64-bit element widths that
// have masked forms are worth retyping to.
31529 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
31531 // Only change element size, not type.
31532 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
31534 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
31537 case X86ISD::SUBV_BROADCAST: {
31538 unsigned EltSize = EltVT.getSizeInBits();
31539 if (EltSize != 32 && EltSize != 64)
31541 // Only change element size, not type.
31542 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
31544 SDValue Op0 = Op.getOperand(0);
// Re-type the broadcast's subvector operand to match the new element size.
31545 MVT Op0VT = MVT::getVectorVT(EltVT,
31546 Op0.getSimpleValueType().getSizeInBits() / EltSize);
31547 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
31548 DCI.AddToWorklist(Op0.getNode());
31549 DCI.CombineTo(OrigOp.getNode(),
31550 DAG.getNode(Opcode, DL, VT, Op0));
31558 /// Do target-specific dag combines on SELECT and VSELECT nodes.
31559 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
31560 TargetLowering::DAGCombinerInfo &DCI,
31561 const X86Subtarget &Subtarget) {
31563 SDValue Cond = N->getOperand(0);
31564 // Get the LHS/RHS of the select.
31565 SDValue LHS = N->getOperand(1);
31566 SDValue RHS = N->getOperand(2);
31567 EVT VT = LHS.getValueType();
31568 EVT CondVT = Cond.getValueType();
31569 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31571 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
31572 // instructions match the semantics of the common C idiom x<y?x:y but not
31573 // x<=y?x:y, because of how they handle negative zero (which can be
31574 // ignored in unsafe-math mode).
31575 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
31576 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
31577 VT != MVT::f80 && VT != MVT::f128 &&
31578 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
31579 (Subtarget.hasSSE2() ||
31580 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
31581 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31583 unsigned Opcode = 0;
31584 // Check for x CC y ? x : y.
31585 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31586 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31590 // Converting this to a min would handle NaNs incorrectly, and swapping
31591 // the operands would cause it to handle comparisons between positive
31592 // and negative zero incorrectly.
31593 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31594 if (!DAG.getTarget().Options.UnsafeFPMath &&
31595 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31597 std::swap(LHS, RHS);
31599 Opcode = X86ISD::FMIN;
31602 // Converting this to a min would handle comparisons between positive
31603 // and negative zero incorrectly.
31604 if (!DAG.getTarget().Options.UnsafeFPMath &&
31605 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31607 Opcode = X86ISD::FMIN;
31610 // Converting this to a min would handle both negative zeros and NaNs
31611 // incorrectly, but we can swap the operands to fix both.
31612 std::swap(LHS, RHS);
31617 Opcode = X86ISD::FMIN;
31621 // Converting this to a max would handle comparisons between positive
31622 // and negative zero incorrectly.
31623 if (!DAG.getTarget().Options.UnsafeFPMath &&
31624 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31626 Opcode = X86ISD::FMAX;
31629 // Converting this to a max would handle NaNs incorrectly, and swapping
31630 // the operands would cause it to handle comparisons between positive
31631 // and negative zero incorrectly.
31632 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31633 if (!DAG.getTarget().Options.UnsafeFPMath &&
31634 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31636 std::swap(LHS, RHS);
31638 Opcode = X86ISD::FMAX;
31641 // Converting this to a max would handle both negative zeros and NaNs
31642 // incorrectly, but we can swap the operands to fix both.
31643 std::swap(LHS, RHS);
31648 Opcode = X86ISD::FMAX;
31651 // Check for x CC y ? y : x -- a min/max with reversed arms.
31652 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
31653 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
31657 // Converting this to a min would handle comparisons between positive
31658 // and negative zero incorrectly, and swapping the operands would
31659 // cause it to handle NaNs incorrectly.
31660 if (!DAG.getTarget().Options.UnsafeFPMath &&
31661 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
31662 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31664 std::swap(LHS, RHS);
31666 Opcode = X86ISD::FMIN;
31669 // Converting this to a min would handle NaNs incorrectly.
31670 if (!DAG.getTarget().Options.UnsafeFPMath &&
31671 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
31673 Opcode = X86ISD::FMIN;
31676 // Converting this to a min would handle both negative zeros and NaNs
31677 // incorrectly, but we can swap the operands to fix both.
31678 std::swap(LHS, RHS);
31683 Opcode = X86ISD::FMIN;
31687 // Converting this to a max would handle NaNs incorrectly.
31688 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31690 Opcode = X86ISD::FMAX;
31693 // Converting this to a max would handle comparisons between positive
31694 // and negative zero incorrectly, and swapping the operands would
31695 // cause it to handle NaNs incorrectly.
31696 if (!DAG.getTarget().Options.UnsafeFPMath &&
31697 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
31698 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31700 std::swap(LHS, RHS);
31702 Opcode = X86ISD::FMAX;
31705 // Converting this to a max would handle both negative zeros and NaNs
31706 // incorrectly, but we can swap the operands to fix both.
31707 std::swap(LHS, RHS);
31712 Opcode = X86ISD::FMAX;
31718 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
31721 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
31722 // lowering on KNL. In this case we convert it to
31723 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
31724 // The same situation for all 128 and 256-bit vectors of i8 and i16.
31725 // Since SKX these selects have a proper lowering.
31726 if (Subtarget.hasAVX512() && CondVT.isVector() &&
31727 CondVT.getVectorElementType() == MVT::i1 &&
31728 (VT.is128BitVector() || VT.is256BitVector()) &&
31729 (VT.getVectorElementType() == MVT::i8 ||
31730 VT.getVectorElementType() == MVT::i16) &&
31731 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
31732 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
31733 DCI.AddToWorklist(Cond.getNode());
31734 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
31737 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
31740 // Canonicalize max and min:
31741 // (x > y) ? x : y -> (x >= y) ? x : y
31742 // (x < y) ? x : y -> (x <= y) ? x : y
31743 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
31744 // the need for an extra compare
31745 // against zero. e.g.
31746 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
31748 // testl %edi, %edi
31750 // cmovgl %edi, %eax
31754 // cmovsl %eax, %edi
31755 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
31756 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31757 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31758 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31763 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
31764 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
31765 Cond.getOperand(0), Cond.getOperand(1), NewCC);
31766 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
31771 // Early exit check
31772 if (!TLI.isTypeLegal(VT))
31775 // Match VSELECTs into subs with unsigned saturation.
31776 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
31777 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
31778 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
31779 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
31780 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31782 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
31783 // left side invert the predicate to simplify logic below.
31785 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
31787 CC = ISD::getSetCCInverse(CC, true);
31788 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
31792 if (Other.getNode() && Other->getNumOperands() == 2 &&
31793 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
31794 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
31795 SDValue CondRHS = Cond->getOperand(1);
31797 // Look for a general sub with unsigned saturation first.
31798 // x >= y ? x-y : 0 --> subus x, y
31799 // x > y ? x-y : 0 --> subus x, y
31800 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
31801 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
31802 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
31804 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
31805 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
31806 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
31807 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
31808 // If the RHS is a constant we have to reverse the const
31809 // canonicalization.
31810 // x > C-1 ? x+-C : 0 --> subus x, C
31811 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
31812 CondRHSConst->getAPIntValue() ==
31813 (-OpRHSConst->getAPIntValue() - 1))
31814 return DAG.getNode(
31815 X86ISD::SUBUS, DL, VT, OpLHS,
31816 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
31818 // Another special case: If C was a sign bit, the sub has been
31819 // canonicalized into a xor.
31820 // FIXME: Would it be better to use computeKnownBits to determine
31821 // whether it's safe to decanonicalize the xor?
31822 // x s< 0 ? x^C : 0 --> subus x, C
31823 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
31824 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
31825 OpRHSConst->getAPIntValue().isSignMask())
31826 // Note that we have to rebuild the RHS constant here to ensure we
31827 // don't rely on particular values of undef lanes.
31828 return DAG.getNode(
31829 X86ISD::SUBUS, DL, VT, OpLHS,
31830 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
31835 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
31838 // If this is a *dynamic* select (non-constant condition) and we can match
31839 // this node with one of the variable blend instructions, restructure the
31840 // condition so that blends can use the high (sign) bit of each element and
31841 // use SimplifyDemandedBits to simplify the condition operand.
31842 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
31843 !DCI.isBeforeLegalize() &&
31844 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
31845 unsigned BitWidth = Cond.getScalarValueSizeInBits();
31847 // Don't optimize vector selects that map to mask-registers.
31851 // We can only handle the cases where VSELECT is directly legal on the
31852 // subtarget. We custom lower VSELECT nodes with constant conditions and
31853 // this makes it hard to see whether a dynamic VSELECT will correctly
31854 // lower, so we both check the operation's status and explicitly handle the
31855 // cases where a *dynamic* blend will fail even though a constant-condition
31856 // blend could be custom lowered.
31857 // FIXME: We should find a better way to handle this class of problems.
31858 // Potentially, we should combine constant-condition vselect nodes
31859 // pre-legalization into shuffles and not mark as many types as custom
31861 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
31863 // FIXME: We don't support i16-element blends currently. We could and
31864 // should support them by making *all* the bits in the condition be set
31865 // rather than just the high bit and using an i8-element blend.
31866 if (VT.getVectorElementType() == MVT::i16)
31868 // Dynamic blending was only available from SSE4.1 onward.
31869 if (VT.is128BitVector() && !Subtarget.hasSSE41())
31871 // Byte blends are only available in AVX2
31872 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
31874 // There are no 512-bit blend instructions that use sign bits.
31875 if (VT.is512BitVector())
31878 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
31879 APInt DemandedMask(APInt::getSignMask(BitWidth));
31881 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
31882 !DCI.isBeforeLegalizeOps());
31883 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
31884 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
31885 // If we changed the computation somewhere in the DAG, this change will
31886 // affect all users of Cond. Make sure it is fine and update all the nodes
31887 // so that we do not use the generic VSELECT anymore. Otherwise, we may
31888 // perform wrong optimizations as we messed with the actual expectation
31889 // for the vector boolean values.
31890 if (Cond != TLO.Old) {
31891 // Check all uses of the condition operand to check whether it will be
31892 // consumed by non-BLEND instructions. Those may require that all bits
31893 // are set properly.
31894 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
31896 // TODO: Add other opcodes eventually lowered into BLEND.
31897 if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
31901 // Update all users of the condition before committing the change, so
31902 // that the VSELECT optimizations that expect the correct vector boolean
31903 // value will not be triggered.
31904 for (SDNode *U : Cond->uses()) {
31905 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
31906 U->getValueType(0), Cond, U->getOperand(1),
31908 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
31910 DCI.CommitTargetLoweringOpt(TLO);
31913 // Only Cond (rather than other nodes in the computation chain) was
31914 // changed. Change the condition just for N to keep the opportunity to
31915 // optimize all other users their own way.
31916 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
31917 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
31922 // Look for vselects with LHS/RHS being bitcasted from an operation that
31923 // can be executed on another type. Push the bitcast to the inputs of
31924 // the operation. This exposes opportunities for using masking instructions.
31925 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
31926 CondVT.getVectorElementType() == MVT::i1) {
31927 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
31928 return SDValue(N, 0);
31929 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
31930 return SDValue(N, 0);
31933 // Custom action for SELECT MMX
31934 if (VT == MVT::x86mmx) {
31935 LHS = DAG.getBitcast(MVT::i64, LHS);
31936 RHS = DAG.getBitcast(MVT::i64, RHS);
31937 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
31938 return DAG.getBitcast(VT, newSelect);
31945 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
31947 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
31948 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
31949 /// Note that this is only legal for some op/cc combinations.
/// \param Cmp  CMP-like node (X86ISD::CMP, or X86ISD::SUB whose arith result
///             is unused) comparing an atomic RMW result against a constant.
/// \param CC   condition code applied to the flags; rewritten in place when
///             the +/-1 is folded into the condition.
/// \returns the replacement flag-producing node on success; on the failure
///          paths (on lines elided from this view) presumably SDValue().
31950 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
31952                                        const X86Subtarget &Subtarget) {
31953 // This combine only operates on CMP-like nodes.
31954 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31955 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31958 // Can't replace the cmp if it has more uses than the one we're looking at.
31959 // FIXME: We would like to be able to handle this, but would need to make sure
31960 // all uses were updated.
31961 if (!Cmp.hasOneUse())
31964 // This only applies to variations of the common case:
31965 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
31966 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
31967 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
31968 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
31969 // Using the proper condcodes (see below), overflow is checked for.
31971 // FIXME: We can generalize both constraints:
31972 // - XOR/OR/AND (if they were made to survive AtomicExpand)
31974 // if the result is compared.
// Decompose the compare; its LHS must be the LOCKed atomic RMW op itself,
// and that RMW value must feed only this compare.
31976 SDValue CmpLHS = Cmp.getOperand(0);
31977 SDValue CmpRHS = Cmp.getOperand(1);
31979 if (!CmpLHS.hasOneUse())
31982 unsigned Opc = CmpLHS.getOpcode();
31983 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
// Operand 2 of an atomic RMW node is the amount added/subtracted; it must be
// a constant so it can be folded into the condition below.
31986 SDValue OpRHS = CmpLHS.getOperand(2);
31987 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS)
31991 APInt Addend = OpRHSC->getAPIntValue();
31992 if (Opc == ISD::ATOMIC_LOAD_SUB)
// NOTE(review): the SUB case negates Addend on a line elided from this view,
// so below Addend always denotes the value effectively added.
31995 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS)
31999 APInt Comparison = CmpRHSC->getAPIntValue();
32001 // If the addend is the negation of the comparison value, then we can do
32002 // a full comparison by emitting the atomic arithmetic as a locked sub.
32003 if (Comparison == -Addend) {
32004 // The CC is fine, but we need to rewrite the LHS of the comparison as an
32006 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
32007 auto AtomicSub = DAG.getAtomic(
32008 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
32009 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
32010 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
32011 AN->getMemOperand());
32012 // If the comparision uses the CF flag we can't use INC/DEC instructions.
32013 bool NeedCF = false;
32016 case X86::COND_A: case X86::COND_AE:
32017 case X86::COND_B: case X86::COND_BE:
// (NeedCF is set for the carry-based conditions above on an elided line.)
32021 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
// Detach the old RMW value (now unused) and rethread the chain onto the new
// LOCKed op before returning its flags.
32022 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
32023 DAG.getUNDEF(CmpLHS.getValueType()));
32024 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
32028 // We can handle comparisons with zero in a number of cases by manipulating
32030 if (!Comparison.isNullValue())
// Map the condition code across the folded +/-1 (e.g. S -> LE); the new CC
// assignments sit on lines elided from this view.
32033 if (CC == X86::COND_S && Addend == 1)
32035 else if (CC == X86::COND_NS && Addend == 1)
32037 else if (CC == X86::COND_G && Addend == -1)
32039 else if (CC == X86::COND_LE && Addend == -1)
32044 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
32045 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
32046 DAG.getUNDEF(CmpLHS.getValueType()));
32047 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
32051 // Check whether a boolean test is testing a boolean value generated by
32052 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
32055 // Simplify the following patterns:
32056 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
32057 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
32058 // to (Op EFLAGS Cond)
32060 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
32061 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
32062 // to (Op EFLAGS !Cond)
32064 // where Op could be BRCOND or CMOV.
// \param CC  in: the condition applied to Cmp (must be E/NE); out: rewritten
//            to the SETCC's own condition, possibly inverted.
32066 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
32067 // This combine only operates on CMP-like nodes.
32068 if (!(Cmp.getOpcode() == X86ISD::CMP ||
32069 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
32072 // Quit if not used as a boolean value.
32073 if (CC != X86::COND_E && CC != X86::COND_NE)
32076 // Check CMP operands. One of them should be 0 or 1 and the other should be
32077 // an SetCC or extended from it.
32078 SDValue Op1 = Cmp.getOperand(0);
32079 SDValue Op2 = Cmp.getOperand(1);
32082 const ConstantSDNode* C = nullptr;
32083 bool needOppositeCond = (CC == X86::COND_E);
32084 bool checkAgainstTrue = false; // Is it a comparison against 1?
32086 if ((C = dyn_cast<ConstantSDNode>(Op1)))
32088 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
32090 else // Quit if all operands are not constants.
32093 if (C->getZExtValue() == 1) {
// Comparing against 1 flips the sense relative to comparing against 0.
32094 needOppositeCond = !needOppositeCond;
32095 checkAgainstTrue = true;
32096 } else if (C->getZExtValue() != 0)
32097 // Quit if the constant is neither 0 or 1.
32100 bool truncatedToBoolWithAnd = false;
32101 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
// NOTE(review): SetCC is declared/initialized from the non-constant CMP
// operand on lines elided from this view.
32102 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
32103 SetCC.getOpcode() == ISD::TRUNCATE ||
32104 SetCC.getOpcode() == ISD::AND) {
32105 if (SetCC.getOpcode() == ISD::AND) {
// Only (and $x, 1) is looked through; remember it canonicalized to 0/1.
32107 if (isOneConstant(SetCC.getOperand(0)))
32109 if (isOneConstant(SetCC.getOperand(1)))
32113 SetCC = SetCC.getOperand(OpIdx);
32114 truncatedToBoolWithAnd = true;
32116 SetCC = SetCC.getOperand(0);
// Dispatch on how the boolean value was produced.
32119 switch (SetCC.getOpcode()) {
32120 case X86ISD::SETCC_CARRY:
32121 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
32122 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
32123 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
32124 // truncated to i1 using 'and'.
32125 if (checkAgainstTrue && !truncatedToBoolWithAnd)
32127 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
32128 "Invalid use of SETCC_CARRY!");
// (Appears to fall through to the generic SETCC handling below.)
32130 case X86ISD::SETCC:
32131 // Set the condition code or opposite one if necessary.
32132 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
32133 if (needOppositeCond)
32134 CC = X86::GetOppositeBranchCondition(CC);
32135 return SetCC.getOperand(1);
32136 case X86ISD::CMOV: {
32137 // Check whether false/true value has canonical one, i.e. 0 or 1.
32138 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
32139 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
32140 // Quit if true value is not a constant.
32143 // Quit if false value is not a constant.
32145 SDValue Op = SetCC.getOperand(0);
32146 // Skip 'zext' or 'trunc' node.
32147 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
32148 Op.getOpcode() == ISD::TRUNCATE)
32149 Op = Op.getOperand(0);
32150 // A special case for rdrand/rdseed, where 0 is set if false cond is
32152 if ((Op.getOpcode() != X86ISD::RDRAND &&
32153 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
32156 // Quit if false value is not the constant 0 or 1.
32157 bool FValIsFalse = true;
32158 if (FVal && FVal->getZExtValue() != 0) {
32159 if (FVal->getZExtValue() != 1)
32161 // If FVal is 1, opposite cond is needed.
32162 needOppositeCond = !needOppositeCond;
32163 FValIsFalse = false;
32165 // Quit if TVal is not the constant opposite of FVal.
32166 if (FValIsFalse && TVal->getZExtValue() != 1)
32168 if (!FValIsFalse && TVal->getZExtValue() != 0)
// Forward the CMOV's own condition (operand 2) and its flags (operand 3).
32170 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
32171 if (needOppositeCond)
32172 CC = X86::GetOppositeBranchCondition(CC);
32173 return SetCC.getOperand(3);
32180 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
32182 /// (X86or (X86setcc) (X86setcc))
32183 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
/// On a match, CC0/CC1 receive the two setcc condition codes and Flags the
/// shared EFLAGS value; returns true. (The and/or out-parameter and the
/// AND/OR case labels sit on lines elided from this view.)
32184 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
32185 X86::CondCode &CC1, SDValue &Flags,
// Peel a (X86cmp X, 0) wrapper down to the underlying AND/OR node.
32187 if (Cond->getOpcode() == X86ISD::CMP) {
32188 if (!isNullConstant(Cond->getOperand(1)))
32191 Cond = Cond->getOperand(0);
32196 SDValue SetCC0, SetCC1;
32197 switch (Cond->getOpcode()) {
32198 default: return false;
// Reached for the and/or opcodes (their case labels are elided here).
32205 SetCC0 = Cond->getOperand(0);
32206 SetCC1 = Cond->getOperand(1);
32210 // Make sure we have SETCC nodes, using the same flags value.
32211 if (SetCC0.getOpcode() != X86ISD::SETCC ||
32212 SetCC1.getOpcode() != X86ISD::SETCC ||
32213 SetCC0->getOperand(1) != SetCC1->getOperand(1))
32216 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
32217 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
32218 Flags = SetCC0->getOperand(1);
32222 // When legalizing carry, we create carries via add X, -1
32223 // If that comes from an actual carry, via setcc, we use the
// If EFLAGS is produced by (X86add X, -1) and X traces back (through
// trunc/zext/sext/anyext or 'and X, 1') to a COND_B setcc, return that
// setcc's flag input so the original carry is consumed directly.
32225 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
32226 if (EFLAGS.getOpcode() == X86ISD::ADD) {
32227 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
32228 SDValue Carry = EFLAGS.getOperand(0);
// Look through value-preserving wrappers around the boolean carry.
32229 while (Carry.getOpcode() == ISD::TRUNCATE ||
32230 Carry.getOpcode() == ISD::ZERO_EXTEND ||
32231 Carry.getOpcode() == ISD::SIGN_EXTEND ||
32232 Carry.getOpcode() == ISD::ANY_EXTEND ||
32233 (Carry.getOpcode() == ISD::AND &&
32234 isOneConstant(Carry.getOperand(1))))
32235 Carry = Carry.getOperand(0);
32236 if (Carry.getOpcode() == X86ISD::SETCC ||
32237 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
// Only a below/carry condition actually encodes CF; reuse its flags.
32238 if (Carry.getConstantOperandVal(0) == X86::COND_B)
32239 return Carry.getOperand(1);
32247 /// Optimize an EFLAGS definition used according to the condition code \p CC
32248 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
32249 /// uses of chain values.
/// Tries, in order: carry threading (COND_B only), boolean-test folding, and
/// folding the compare into a LOCKed atomic op.
32250 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
32252 const X86Subtarget &Subtarget) {
// COND_B tests CF directly, so try to reuse a carry created via add X, -1.
32253 if (CC == X86::COND_B)
32254 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
32257 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
// Last resort: fold (cmp (atomic_load_add x, C), C') into the LOCKed op.
32259 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
32262 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
32263 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
32264 TargetLowering::DAGCombinerInfo &DCI,
32265 const X86Subtarget &Subtarget) {
// Operand order is (FalseOp, TrueOp, CC, EFLAGS) — the opposite of
// ISD::SELECT (see the comment below at the constant-select optimization).
32268 SDValue FalseOp = N->getOperand(0);
32269 SDValue TrueOp = N->getOperand(1);
32270 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
32271 SDValue Cond = N->getOperand(3);
32273 if (CC == X86::COND_E || CC == X86::COND_NE) {
32274 switch (Cond.getOpcode()) {
32278 // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
32279 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
32280 return (CC == X86::COND_E) ? FalseOp : TrueOp;
32284 // Try to simplify the EFLAGS and condition code operands.
32285 // We can't always do this as FCMOV only supports a subset of X86 cond.
32286 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
32287 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
32288 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
32290 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32294 // If this is a select between two integer constants, try to do some
32295 // optimizations. Note that the operands are ordered the opposite of SELECT
32297 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
32298 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
32299 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
32300 // larger than FalseC (the false value).
32301 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
32302 CC = X86::GetOppositeBranchCondition(CC);
32303 std::swap(TrueC, FalseC);
32304 std::swap(TrueOp, FalseOp);
32307 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
32308 // This is efficient for any integer data type (including i8/i16) and
32310 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
32311 Cond = getSETCC(CC, Cond, DL, DAG);
32313 // Zero extend the condition if needed.
32314 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
// Shift the 0/1 setcc result into place of the single set bit of TrueC.
32316 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
32317 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
32318 DAG.getConstant(ShAmt, DL, MVT::i8));
32322 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
32323 // for any integer data type, including i8/i16.
32324 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
32325 Cond = getSETCC(CC, Cond, DL, DAG);
32327 // Zero extend the condition if needed.
32328 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
32329 FalseC->getValueType(0), Cond);
32330 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
32331 SDValue(FalseC, 0));
32335 // Optimize cases that will turn into an LEA instruction. This requires
32336 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
32337 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
32338 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
32339 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
32341 bool isFastMultiplier = false;
// Only multipliers encodable in a single LEA (scale 1/2/4/8 +/- base) count.
32343 switch ((unsigned char)Diff) {
32345 case 1: // result = add base, cond
32346 case 2: // result = lea base( , cond*2)
32347 case 3: // result = lea base(cond, cond*2)
32348 case 4: // result = lea base( , cond*4)
32349 case 5: // result = lea base(cond, cond*4)
32350 case 8: // result = lea base( , cond*8)
32351 case 9: // result = lea base(cond, cond*8)
32352 isFastMultiplier = true;
32357 if (isFastMultiplier) {
32358 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
32359 Cond = getSETCC(CC, Cond, DL ,DAG);
32360 // Zero extend the condition if needed.
32361 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
32363 // Scale the condition by the difference.
32365 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
32366 DAG.getConstant(Diff, DL, Cond.getValueType()));
32368 // Add the base if non-zero.
32369 if (FalseC->getAPIntValue() != 0)
32370 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
32371 SDValue(FalseC, 0));
32378 // Handle these cases:
32379 // (select (x != c), e, c) -> select (x != c), e, x),
32380 // (select (x == c), c, e) -> select (x == c), x, e)
32381 // where the c is an integer constant, and the "select" is the combination
32382 // of CMOV and CMP.
32384 // The rationale for this change is that the conditional-move from a constant
32385 // needs two instructions, however, conditional-move from a register needs
32386 // only one instruction.
32388 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
32389 // some instruction-combining opportunities. This opt needs to be
32390 // postponed as late as possible.
32392 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
32393 // the DCI.xxxx conditions are provided to postpone the optimization as
32394 // late as possible.
32396 ConstantSDNode *CmpAgainst = nullptr;
32397 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
32398 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
32399 !isa<ConstantSDNode>(Cond.getOperand(0))) {
// Canonicalize the matched constant onto the true side first.
32401 if (CC == X86::COND_NE &&
32402 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
32403 CC = X86::GetOppositeBranchCondition(CC);
32404 std::swap(TrueOp, FalseOp);
// On equality the CMP's LHS equals the constant, so substitute the register.
32407 if (CC == X86::COND_E &&
32408 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
32409 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
32410 DAG.getConstant(CC, DL, MVT::i8), Cond };
32411 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32416 // Fold and/or of setcc's to double CMOV:
32417 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
32418 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
32420 // This combine lets us generate:
32421 // cmovcc1 (jcc1 if we don't have CMOV)
32427 // cmovne (jne if we don't have CMOV)
32428 // When we can't use the CMOV instruction, it might increase branch
32430 // When we can use CMOV, or when there is no mispredict, this improves
32431 // throughput and reduces register pressure.
32433 if (CC == X86::COND_NE) {
32435 X86::CondCode CC0, CC1;
32437 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
// For the AND form, apply De Morgan: swap arms and invert both conditions.
32439 std::swap(FalseOp, TrueOp);
32440 CC0 = X86::GetOppositeBranchCondition(CC0);
32441 CC1 = X86::GetOppositeBranchCondition(CC1);
32444 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
32446 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
32447 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
32448 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32456 /// Different mul shrinking modes.
/// MULS8/MULU8: both operands fit in signed/unsigned 8 bits (only the low
/// pmullw result is needed). MULS16/MULU16: operands fit in signed/unsigned
/// 16 bits (pmullw plus pmulhw/pmulhuw for the high half). See
/// canReduceVMulWidth/reduceVMULWidth below.
32457 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
// Decide whether the i32-element vector multiply N can be narrowed, and if
// so which ShrinkMode applies, based on the known sign-bit counts of both
// operands. Mode is set on elided lines next to each range comment below.
32459 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
32460 EVT VT = N->getOperand(0).getValueType();
32461 if (VT.getScalarSizeInBits() != 32)
32464 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
// Per-operand: number of known sign bits, and whether the value is known
// non-negative.
32465 unsigned SignBits[2] = {1, 1};
32466 bool IsPositive[2] = {false, false};
32467 for (unsigned i = 0; i < 2; i++) {
32468 SDValue Opd = N->getOperand(i);
32470 // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
32471 // compute signbits for it separately.
32472 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
32473 // For anyextend, it is safe to assume an appropriate number of leading
32475 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
32477 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
32482 IsPositive[i] = true;
32483 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
32484 // All the operands of BUILD_VECTOR need to be int constant.
32485 // Find the smallest value range which all the operands belong to.
32487 IsPositive[i] = true;
32488 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
32489 if (SubOp.isUndef())
32491 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
32494 APInt IntVal = CN->getAPIntValue();
32495 if (IntVal.isNegative())
32496 IsPositive[i] = false;
// Track the worst (smallest) sign-bit count over all constant lanes.
32497 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
32500 SignBits[i] = DAG.ComputeNumSignBits(Opd);
32501 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
32502 IsPositive[i] = true;
32506 bool AllPositive = IsPositive[0] && IsPositive[1];
32507 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
32508 // When ranges are from -128 ~ 127, use MULS8 mode.
32509 if (MinSignBits >= 25)
32511 // When ranges are from 0 ~ 255, use MULU8 mode.
32512 else if (AllPositive && MinSignBits >= 24)
32514 // When ranges are from -32768 ~ 32767, use MULS16 mode.
32515 else if (MinSignBits >= 17)
32517 // When ranges are from 0 ~ 65535, use MULU16 mode.
32518 else if (AllPositive && MinSignBits >= 16)
32525 /// When the operands of vector mul are extended from smaller size values,
32526 /// like i8 and i16, the type of mul may be shrinked to generate more
32527 /// efficient code. Two typical patterns are handled:
32529 /// %2 = sext/zext <N x i8> %1 to <N x i32>
32530 /// %4 = sext/zext <N x i8> %3 to <N x i32>
32531 // or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32532 /// %5 = mul <N x i32> %2, %4
32535 /// %2 = zext/sext <N x i16> %1 to <N x i32>
32536 /// %4 = zext/sext <N x i16> %3 to <N x i32>
32537 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32538 /// %5 = mul <N x i32> %2, %4
32540 /// There are four mul shrinking modes:
32541 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
32542 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
32543 /// generate pmullw+sext32 for it (MULS8 mode).
32544 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
32545 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
32546 /// generate pmullw+zext32 for it (MULU8 mode).
32547 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
32548 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
32549 /// generate pmullw+pmulhw for it (MULS16 mode).
32550 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
32551 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
32552 /// generate pmullw+pmulhuw for it (MULU16 mode).
32553 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
32554 const X86Subtarget &Subtarget) {
32555 // Check for legality
32556 // pmullw/pmulhw are not supported by SSE.
32557 if (!Subtarget.hasSSE2())
32560 // Check for profitability
32561 // pmulld is supported since SSE41. It is better to use pmulld
32562 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
32564 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
32565 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
32569 if (!canReduceVMulWidth(N, DAG, Mode))
32573 SDValue N0 = N->getOperand(0);
32574 SDValue N1 = N->getOperand(1);
32575 EVT VT = N->getOperand(0).getValueType();
32576 unsigned NumElts = VT.getVectorNumElements();
32577 if ((NumElts % 2) != 0)
32580 // If the upper 17 bits of each element are zero then we can use PMADD.
32581 APInt Mask17 = APInt::getHighBitsSet(32, 17);
32582 if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
32583 DAG.MaskedValueIsZero(N1, Mask17))
32584 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
32585 DAG.getBitcast(MVT::v8i16, N1));
// OpsVT is a full 128-bit i16 vector; ReducedVT keeps the original element
// count but with i16 elements.
32587 unsigned RegSize = 128;
32588 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
32589 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
32591 // Shrink the operands of mul.
32592 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
32593 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
// Case 1: the narrowed vector fills at least one 128-bit register.
32595 if (NumElts >= OpsVT.getVectorNumElements()) {
32596 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
32597 // lower part is needed.
32598 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
32599 if (Mode == MULU8 || Mode == MULS8) {
32600 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
32603 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
32604 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
32605 // the higher part is also needed.
32606 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32607 ReducedVT, NewN0, NewN1);
32609 // Repack the lower part and higher part result of mul into a wider
32611 // Generate shuffle functioning as punpcklwd.
32612 SmallVector<int, 16> ShuffleMask(NumElts);
32613 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32614 ShuffleMask[2 * i] = i;
32615 ShuffleMask[2 * i + 1] = i + NumElts;
32618 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32619 ResLo = DAG.getBitcast(ResVT, ResLo);
32620 // Generate shuffle functioning as punpckhwd.
32621 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32622 ShuffleMask[2 * i] = i + NumElts / 2;
32623 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
32626 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32627 ResHi = DAG.getBitcast(ResVT, ResHi);
32628 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
// Case 2: fewer elements than a full register — widen with undef first.
32631 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
32632 // to legalize the mul explicitly because implicit legalization for type
32633 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
32634 // instructions which will not exist when we explicitly legalize it by
32635 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
32636 // <4 x i16> undef).
32638 // Legalize the operands of mul.
32639 // FIXME: We may be able to handle non-concatenated vectors by insertion.
32640 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
32641 if ((RegSize % ReducedSizeInBits) != 0)
32644 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
32645 DAG.getUNDEF(ReducedVT));
32647 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
32649 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
32651 if (Mode == MULU8 || Mode == MULS8) {
32652 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
32654 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32656 // convert the type of mul result to VT.
32657 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32658 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
32659 : ISD::SIGN_EXTEND_VECTOR_INREG,
32661 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32662 DAG.getIntPtrConstant(0, DL));
32664 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
32665 // MULU16/MULS16, both parts are needed.
32666 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32667 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32668 OpsVT, NewN0, NewN1);
32670 // Repack the lower part and higher part result of mul into a wider
32671 // result. Make sure the type of mul result is VT.
32672 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32673 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
32674 Res = DAG.getBitcast(ResVT, Res);
32675 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32676 DAG.getIntPtrConstant(0, DL));
/// Lower a multiply by the constant MulAmt into a short LEA/shift/add/sub
/// sequence for amounts not directly encodable as a single LEA scale.
/// (The case labels selecting on MulAmt sit on lines elided from this view;
/// each visible comment states the amount handled and the identity used.)
32681 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
32682 EVT VT, SDLoc DL) {
// Helper: (x*Mult << Shift) +/- x.
32684 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
32685 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32686 DAG.getConstant(Mult, DL, VT));
32687 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
32688 DAG.getConstant(Shift, DL, MVT::i8));
32689 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
// Helper: (x*9*3) +/- x, i.e. 27x plus or minus x.
32694 auto combineMulMulAddOrSub = [&](bool isAdd) {
32695 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32696 DAG.getConstant(9, DL, VT));
32697 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
32698 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
32707 // mul x, 11 => add ((shl (mul x, 5), 1), x)
32708 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
32710 // mul x, 21 => add ((shl (mul x, 5), 2), x)
32711 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
32713 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
32714 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32715 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
32717 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
32718 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
32720 // mul x, 13 => add ((shl (mul x, 3), 2), x)
32721 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
32723 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
32724 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
32726 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
32727 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32728 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
32730 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
32731 return combineMulMulAddOrSub(/*isAdd*/ false);
32733 // mul x, 28 => add ((mul (mul x, 9), 3), x)
32734 return combineMulMulAddOrSub(/*isAdd*/ true);
32736 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
32737 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32738 combineMulMulAddOrSub(/*isAdd*/ true));
32740 // mul x, 30 => sub (sub ((shl x, 5), x), x)
32741 return DAG.getNode(
32743 DAG.getNode(ISD::SUB, DL, VT,
32744 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32745 DAG.getConstant(5, DL, MVT::i8)),
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
///
/// Strategy: vector multiplies before legalization are narrowed via
/// reduceVMULWidth; scalar i32/i64 constant multiplies are factored into a
/// shift plus an LEA-able multiply by 3/5/9, or handed to combineMulSpecial.
/// NOTE(review): several early-exit 'return SDValue();' lines, the 'else'
/// keywords before the MUL_IMM fallbacks, and the MulAmt1 assignments are
/// elided in this listing.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (DCI.isBeforeLegalize() && VT.isVector())
    return reduceVMULWidth(N, DAG, Subtarget);

  // Bail out when the optimization is disabled by its command-line flag.
  if (!MulConstantOptimization)
  // An imul is usually smaller than the alternative sequence.
  if (DAG.getMachineFunction().getFunction().optForMinSize())
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  // Only the scalar types that SHL/LEA can handle directly.
  if (VT != MVT::i64 && VT != MVT::i32)
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  uint64_t MulAmt = C->getZExtValue();
  // A single shift or single LEA is matched by isel patterns already.
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  // Factor MulAmt as (3|5|9) * MulAmt2; MulAmt1 receives the LEA-able factor
  // (the MulAmt1 assignment lines are elided in this listing).
  if ((MulAmt % 9) == 0) {
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt2 = MulAmt / 3;
  // The co-factor must itself be cheap (a shift or another LEA).
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If second multiplier is pow2, issue it first. We want the multiply by
      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
      // is an add.
      std::swap(MulAmt1, MulAmt2);

    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, DL, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, DL, VT));
  } else if (!Subtarget.slowLEA())
    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);

  // 0 and all-ones multipliers would overflow the 2^N +/- 1 logic below.
  assert(MulAmt != 0 &&
         MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
         "Both cases that could cause potential overflows should have "
         "already been handled.");
  int64_t SignMulAmt = C->getSExtValue();
  // Exclude amounts whose negation/decrement would overflow int64.
  if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
      (SignMulAmt != -INT64_MAX)) {
    int NumSign = SignMulAmt > 0 ? 1 : -1;
    bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
    bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
    if (IsPowerOf2_64PlusOne) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      NewMul = DAG.getNode(
          ISD::ADD, DL, VT, N->getOperand(0),
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
    } else if (IsPowerOf2_64MinusOne) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      NewMul = DAG.getNode(
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
    // To negate, subtract the number from zero
    if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, NewMul, false);
// Combines for ISD::SHL: fold a shift of a masked setcc_c into an AND with
// the shifted mask, and turn a splat "shl V, 1" into "add V, V".
// NOTE(review): the 'MaskOK = true;' assignments in the first two branches,
// 'SDLoc DL(N);', and the trailing returns are elided in this listing.
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zero's or all ones.
  if (VT.isInteger() && !VT.isVector() &&
      N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
    Mask <<= N1C->getAPIntValue();
    bool MaskOK = false;
    // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
    // of the underlying setcc_c operation if the setcc_c was zero extended.
    // Consider the following example:
    //   zext(setcc_c)                 -> i32 0x0000FFFF
    //   c1                            -> i32 0x0000FFFF
    //   c2                            -> i32 0x00000001
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
                N00.getOpcode() == ISD::ANY_EXTEND) &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      // Zero/any-extended setcc_c: only safe when the shifted mask still
      // fits in the narrower pre-extension width.
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    }
    if (MaskOK && Mask != 0) {
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));

  // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on sandybridge ADD is faster than
  // shl: (shl V, 1) -> add V,V
  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
      assert(N0.getValueType().isVector() && "Invalid vector shift type");
      // We shift all of the values by one. In many cases we do not have
      // hardware support for this operation. This is better expressed as an
      // ADD of two values.
      if (N1SplatC->getAPIntValue() == 1)
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
// Combines for ISD::SRA: rewrite an ashr-of-shl pair as a sign-extension
// (MOVSX) plus a residual shift, which is usually no larger and more flexible.
// NOTE(review): the early 'return SDValue();' lines, 'SDLoc DL(N);', the
// 'SDValue NN =' opener and the 'if (SarConst == 0) return NN;' check are
// elided in this listing.
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Size = VT.getSizeInBits();

  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
  // depending on sign of (SarConst - [56,48,32,24,16])

  // sexts in X86 are MOVs. The MOVs have the same code size
  // as above SHIFTs (only SHIFT on 1 has lower code size).
  // However the MOVs have 2 advantages to a SHIFT:
  // 1. MOVs can write to a register that differs from source
  // 2. MOVs accept memory operands

  if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
      N0.getOperand(1).getOpcode() != ISD::Constant)

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
  EVT CVT = N1.getValueType();

  // Negative shift amounts would be undefined behaviour; bail out.
  if (SarConst.isNegative())

  for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
    unsigned ShiftSize = SVT.getSizeInBits();
    // skipping types without corresponding sext/zext and
    // ShlConst that is not one of [56,48,32,24,16]
    if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
    // Sign-extend the bits that survive the shl/ashr round trip, then apply
    // the residual shift amount.
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    else if (SarConst.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, NN,
                         DAG.getConstant(-SarConst, DL, CVT));
    return DAG.getNode(ISD::SRA, DL, VT, NN,
                       DAG.getConstant(SarConst, DL, CVT));
32974 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
32975 SDValue N0 = N->getOperand(0);
32976 SDValue N1 = N->getOperand(1);
32977 EVT VT = N0.getValueType();
32979 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
32980 // TODO: This is a generic DAG combine that became an x86-only combine to
32981 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
32982 // and-not ('andn').
32983 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
32986 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
32987 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32988 if (!ShiftC || !AndC)
32991 // If we can shrink the constant mask below 8-bits or 32-bits, then this
32992 // transform should reduce code size. It may also enable secondary transforms
32993 // from improved known-bits analysis or instruction selection.
32994 APInt MaskVal = AndC->getAPIntValue();
32995 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
32996 unsigned OldMaskSize = MaskVal.getMinSignedBits();
32997 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
32998 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
32999 (OldMaskSize > 32 && NewMaskSize <= 32)) {
33000 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
33002 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
33003 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
33004 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
33009 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
33010 TargetLowering::DAGCombinerInfo &DCI,
33011 const X86Subtarget &Subtarget) {
33012 if (N->getOpcode() == ISD::SHL)
33013 if (SDValue V = combineShiftLeft(N, DAG))
33016 if (N->getOpcode() == ISD::SRA)
33017 if (SDValue V = combineShiftRightArithmetic(N, DAG))
33020 if (N->getOpcode() == ISD::SRL)
33021 if (SDValue V = combineShiftRightLogical(N, DAG))
// Combines for X86ISD::PACKSS/PACKUS: constant-fold the saturating truncate
// when both inputs are constant, otherwise try to fold into a target shuffle.
// NOTE(review): the 'if (IsSigned) { ... } else { ... }' structure around the
// two saturation paths, 'SDValue Op(N, 0);' and the trailing returns are
// elided in this listing.
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  // FIXME: the message says "shift" but this asserts a PACK opcode.
  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
         "Unexpected shift opcode");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
  // PACK halves the element width: source elements are twice as wide.
  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
         "Unexpected PACKSS/PACKUS input type");

  // Constant Folding.
  APInt UndefElts0, UndefElts1;
  SmallVector<APInt, 32> EltBits0, EltBits1;
  if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
      (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
    // PACK interleaves per 128-bit lane: low half of each lane from N0, high
    // half from N1.
    unsigned NumLanes = VT.getSizeInBits() / 128;
    unsigned NumDstElts = VT.getVectorNumElements();
    unsigned NumSrcElts = NumDstElts / 2;
    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
    bool IsSigned = (X86ISD::PACKSS == Opcode);

    APInt Undefs(NumDstElts, 0);
    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
        auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
        auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);

        // Undef source element produces an undef destination element.
        if (UndefElts[SrcIdx]) {
          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);

        APInt &Val = EltBits[SrcIdx];
          // PACKSS: Truncate signed value with signed saturation.
          // Source values less than dst minint are saturated to minint.
          // Source values greater than dst maxint are saturated to maxint.
          if (Val.isSignedIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getSignedMinValue(DstBitsPerElt);
            Val = APInt::getSignedMaxValue(DstBitsPerElt);
          // PACKUS: Truncate signed value with unsigned saturation.
          // Source values less than zero are saturated to zero.
          // Source values greater than dst maxuint are saturated to maxuint.
          if (Val.isIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getNullValue(DstBitsPerElt);
            Val = APInt::getAllOnesValue(DstBitsPerElt);

        Bits[Lane * NumDstEltsPerLane + Elt] = Val;

    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));

  // Attempt to combine as shuffle.
  if (SDValue Res = combineX86ShufflesRecursively(
          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
    DCI.CombineTo(N, Res);
// Combines for the X86 immediate vector shifts (VSHLI/VSRLI/VSRAI): handle
// out-of-range amounts, trivial identities, shuffle decoding of whole-byte
// logical shifts, and constant folding.
// NOTE(review): several lines (the shift-by-zero return, 'SDValue Op(N, 0);',
// the VSRAI-of-VSHLI 'return N00;', 'APInt UndefElts;', the VSHLI
// shl-in-place and trailing returns) are elided in this listing.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
          X86ISD::VSRLI == Opcode) &&
         "Unexpected shift opcode");
  bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
         "Unexpected value type");

  // Out of range logical bit shifts are guaranteed to be zero.
  // Out of range arithmetic bit shifts splat the sign bit.
  APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
      // Arithmetic shift: clamp the amount to NumBitsPerElt - 1.
      ShiftVal = NumBitsPerElt - 1;

  // Shift N0 by zero -> N0.

  // Shift zero -> zero.
  if (ISD::isBuildVectorAllZeros(N0.getNode()))
    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

  // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
  // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
  // TODO - support other sra opcodes as needed.
  if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
      N0.getOpcode() == X86ISD::VSRAI)
    return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);

  // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
  if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
      N1 == N0.getOperand(1)) {
    SDValue N00 = N0.getOperand(0);
    unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
    if (ShiftVal.ult(NumSignBits))

  // We can decode 'whole byte' logical bit shifts as shuffles.
  if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      DCI.CombineTo(N, Res);

  // Constant Folding.
  SmallVector<APInt, 32> EltBits;
  if (N->isOnlyUserOf(N0.getNode()) &&
      getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
    assert(EltBits.size() == VT.getVectorNumElements() &&
           "Unexpected shift value type");
    unsigned ShiftImm = ShiftVal.getZExtValue();
    // Apply the shift element-wise to the constant bits.
    for (APInt &Elt : EltBits) {
      if (X86ISD::VSHLI == Opcode)
      else if (X86ISD::VSRAI == Opcode)
        Elt.ashrInPlace(ShiftImm);
        Elt.lshrInPlace(ShiftImm);
    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
// Combine PINSRB/PINSRW element insertions into a target shuffle via the
// recursive shuffle combiner.
// NOTE(review): the 'assert(' opener, 'SDValue Op(N, 0);' and the trailing
// returns are elided in this listing.
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
       (N->getOpcode() == X86ISD::PINSRW &&
        N->getValueType(0) == MVT::v8i16)) &&
      "Unexpected vector insertion");

  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
  if (SDValue Res = combineX86ShufflesRecursively(
          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
    DCI.CombineTo(N, Res);
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
/// NOTE(review): 'SDLoc DL(N);', the opcode variable declaration, several
/// early returns, some switch case labels and the cc0/cc1 swap body are
/// elided in this listing.
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT VT = CMP00.getValueType();

    if (VT == MVT::f32 || VT == MVT::f64) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
           !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
          // Flag-consuming users (branches/selects) block the transform.
            ExpectingFlags = true;
          case ISD::CopyToReg:
          case ISD::SIGN_EXTEND:
          case ISD::ZERO_EXTEND:
          case ISD::ANY_EXTEND:

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        // Canonicalize so the E/NE condition ends up in cc0.
        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;

        if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          if (Subtarget.hasAVX512()) {
                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
                            DAG.getConstant(x86cc, DL, MVT::i8));
            return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                               N->getSimpleValueType(0), FSetCC,
                               DAG.getIntPtrConstant(0, DL));
          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
                                              CMP00.getValueType(), CMP00, CMP01,
                                              DAG.getConstant(x86cc, DL,

          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

          if (is64BitFP && !Subtarget.is64Bit()) {
            // On a 32-bit target, we cannot bitcast the 64-bit float to a
            // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
            // bits, but can do this little dance to extract the lowest 32 bits
            // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                        Vector32, DAG.getIntPtrConstant(0, DL));

          // Bitcast to integer and keep only the low bit (the compare result).
          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                      DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
          return OneBitOfTruth;
33316 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
33317 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
33318 assert(N->getOpcode() == ISD::AND);
33320 EVT VT = N->getValueType(0);
33321 SDValue N0 = N->getOperand(0);
33322 SDValue N1 = N->getOperand(1);
33325 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
33328 if (N0.getOpcode() == ISD::XOR &&
33329 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
33330 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
33332 if (N1.getOpcode() == ISD::XOR &&
33333 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
33334 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
// NOTE(review): 'SDLoc DL(N);', several early 'return SDValue();' lines, the
// 'if (RHSTrunc)' guard and the ANY_EXTEND 'return Op;' are elided in this
// listing.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  assert(VT.isVector() && "Expected vector type");

  assert((N->getOpcode() == ISD::ANY_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND ||
          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

  SDValue Narrow = N->getOperand(0);
  EVT NarrowVT = Narrow.getValueType();

  // Only widen the narrow bitwise logic ops.
  if (Narrow->getOpcode() != ISD::XOR &&
      Narrow->getOpcode() != ISD::AND &&
      Narrow->getOpcode() != ISD::OR)

  SDValue N0 = Narrow->getOperand(0);
  SDValue N1 = Narrow->getOperand(1);

  // The Left side has to be a trunc.
  if (N0.getOpcode() != ISD::TRUNCATE)

  // The type of the truncated inputs.
  if (N0->getOperand(0).getValueType() != VT)

  // The right side has to be a 'trunc' or a constant vector.
  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
                  N1.getOperand(0).getValueType() == VT;
      !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // The wide operation must itself be legal (or promotable) for VT.
  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))

  // Set N0 and N1 to hold the inputs to the new wide operation.
  N0 = N0->getOperand(0);
    N1 = N1->getOperand(0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);

  // Generate the wide operation.
  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
  unsigned Opcode = N->getOpcode();
  // Re-apply the extension semantics on the widened result.
  default: llvm_unreachable("Unexpected opcode");
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
    return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
                       Op, DAG.getValueType(NarrowVT));
33409 /// If both input operands of a logic op are being cast from floating point
33410 /// types, try to convert this into a floating point logic node to avoid
33411 /// unnecessary moves from SSE to integer registers.
33412 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
33413 const X86Subtarget &Subtarget) {
33414 unsigned FPOpcode = ISD::DELETED_NODE;
33415 if (N->getOpcode() == ISD::AND)
33416 FPOpcode = X86ISD::FAND;
33417 else if (N->getOpcode() == ISD::OR)
33418 FPOpcode = X86ISD::FOR;
33419 else if (N->getOpcode() == ISD::XOR)
33420 FPOpcode = X86ISD::FXOR;
33422 assert(FPOpcode != ISD::DELETED_NODE &&
33423 "Unexpected input node for FP logic conversion");
33425 EVT VT = N->getValueType(0);
33426 SDValue N0 = N->getOperand(0);
33427 SDValue N1 = N->getOperand(1);
33429 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
33430 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
33431 (Subtarget.hasSSE2() && VT == MVT::i64))) {
33432 SDValue N00 = N0.getOperand(0);
33433 SDValue N10 = N1.getOperand(0);
33434 EVT N00Type = N00.getValueType();
33435 EVT N10Type = N10.getValueType();
33436 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
33437 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
33438 return DAG.getBitcast(VT, FPLogic);
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
/// NOTE(review): 'APInt SplatVal;', 'SDLoc DL(N);' and the early
/// 'return SDValue();' lines are elided in this listing.
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
  EVT VT0 = Op0.getValueType();
  EVT VT1 = Op1.getValueType();

  // Both operands must share one simple integer type.
  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())

  // The mask must be a splatted contiguous low-bits mask.
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
      !SplatVal.isMask())

  // The target must support an immediate logical right shift of this type.
  if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))

  // Op0 must be all sign bits per element (all zeros or all ones).
  unsigned EltBitWidth = VT0.getScalarSizeInBits();
  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))

  // Shift right so only the masked low bits survive; this replaces the AND
  // and avoids materializing the constant mask.
  unsigned ShiftVal = SplatVal.countTrailingOnes();
  SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
  return DAG.getBitcast(N->getValueType(0), Shift);
33476 // Get the index node from the lowered DAG of a GEP IR instruction with one
33477 // indexing dimension.
33478 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
33479 if (Ld->isIndexed())
33482 SDValue Base = Ld->getBasePtr();
33484 if (Base.getOpcode() != ISD::ADD)
33487 SDValue ShiftedIndex = Base.getOperand(0);
33489 if (ShiftedIndex.getOpcode() != ISD::SHL)
33492 return ShiftedIndex.getOperand(0);
33496 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
33497 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
33498 switch (VT.getSizeInBits()) {
33499 default: return false;
33500 case 64: return Subtarget.is64Bit() ? true : false;
33501 case 32: return true;
// This function recognizes cases where X86 bzhi instruction can replace and
// 'and-load' sequence.
// In case of loading integer value from an array of constants which is defined
// as follows:
//
//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// then applying a bitwise and on the result with another input.
// It's equivalent to performing bzhi (zero high bits) on the input, with the
// same index of the load.
// NOTE(review): 'SDLoc dl(Node);', the 'continue' statements, the null check
// on the loaded Index, and the closing braces/returns are elided in this
// listing.
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = Node->getSimpleValueType(0);

  // Check if subtarget has BZHI instruction for the node's type
  if (!hasBZHI(Subtarget, VT))

  // Try matching the pattern for both operands.
  for (unsigned i = 0; i < 2; i++) {
    SDValue N = Node->getOperand(i);
    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

    // continue if the operand is not a load instruction
    const Value *MemOp = Ld->getMemOperand()->getValue();

    // Walk up from the load to a constant global array GEP.
    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {

          Constant *Init = GV->getInitializer();
          Type *Ty = Init->getType();
          // The initializer must be an integer array whose element width
          // matches VT and whose length fits the mask pattern below.
          if (!isa<ConstantDataArray>(Init) ||
              !Ty->getArrayElementType()->isIntegerTy() ||
              Ty->getArrayElementType()->getScalarSizeInBits() !=
                  VT.getSizeInBits() ||
              Ty->getArrayNumElements() >
                  Ty->getArrayElementType()->getScalarSizeInBits())

          // Check if the array's constant elements are suitable to our case.
          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
          bool ConstantsMatch = true;
          for (uint64_t j = 0; j < ArrayElementCount; j++) {
            // Each element must equal 2^j - 1 (a low-bits mask of width j).
            // The isa<ConstantDataArray> + integer-element checks above mean
            // this dyn_cast is expected to succeed — presumably guaranteed;
            // TODO(review): confirm no null deref is possible here.
            ConstantInt *Elem =
                dyn_cast<ConstantInt>(Init->getAggregateElement(j));
            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
              ConstantsMatch = false;
          if (!ConstantsMatch)

          // Do the transformation (For 32-bit type):
          // -> (and (load arr[idx]), inp)
          // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
          // that will be replaced with one bzhi instruction.
          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);

          // Get the Node which indexes into the array.
          SDValue Index = getIndexFromUnindexedLoad(Ld);
          Index = DAG.getZExtOrTrunc(Index, dl, VT);

          SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);

          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
// Combines for ISD::AND: SSE1-only FP fallback, compare-equal/FP-logic/ANDNP
// folds, BZHI matching, and shuffle-based bitmask folds.
// NOTE(review): several 'return ...;' lines after the helper combines,
// 'SDValue Op(N, 0);', 'APInt UndefElts;' and intermediate braces are elided
// in this listing.
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // If this is SSE1 only convert to FAND to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(
        MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));

  if (DCI.isBeforeLegalizeOps())

  // Try each specialized AND combine in turn.
  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))

  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))

  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))

  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))

  // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      DCI.CombineTo(N, Res);

  // Attempt to combine a scalar bitmask AND with an extracted shuffle.
  if ((VT.getScalarSizeInBits() % 8) == 0 &&
      N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
    SDValue BitMask = N->getOperand(1);
    SDValue SrcVec = N->getOperand(0).getOperand(0);
    EVT SrcVecVT = SrcVec.getValueType();

    // Check that the constant bitmask masks whole bytes.
    SmallVector<APInt, 64> EltBits;
    if (VT == SrcVecVT.getScalarType() &&
        N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
        getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
        llvm::all_of(EltBits, [](APInt M) {
          return M.isNullValue() || M.isAllOnesValue();
      unsigned NumElts = SrcVecVT.getVectorNumElements();
      unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
      unsigned Idx = N->getOperand(0).getConstantOperandVal(1);

      // Create a root shuffle mask from the byte mask and the extracted index.
      SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
      for (unsigned i = 0; i != Scale; ++i) {
        // Zero out masked bytes, keep the rest at their extracted position.
        int VecIdx = Scale * Idx + i;
        ShuffleMask[VecIdx] =
            EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;

      if (SDValue Shuffle = combineX86ShufflesRecursively(
              {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
              /*HasVarMask*/ false, DAG, DCI, Subtarget))
        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
                           N->getOperand(0).getOperand(1));
33679 // (or (and (m, y), (pandn m, x)))
33681 // (vselect m, x, y)
33682 // As a special case, try to fold:
33683 // (or (and (m, (sub 0, x)), (pandn m, x)))
33685 // (sub (xor X, M), M)
// Match (or (and M, Y), (andnp M, X)) — i.e. a manually-expanded vselect —
// and lower it either to the conditional-negate form (sub (xor X, M), M)
// when one side is the negation of the other, or to a PBLENDVB-style
// DAG.getSelect on SSE4.1+.
// NOTE(review): extraction dropped some lines here (e.g. the declaration of
// Y before its first use at 33711, several early `return SDValue();`
// statements, and a `SDLoc DL(N);`); comments describe only visible code.
33686 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
33687 const X86Subtarget &Subtarget) {
33688 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
33690 SDValue N0 = N->getOperand(0);
33691 SDValue N1 = N->getOperand(1);
33692 EVT VT = N->getValueType(0);
// Only 128-bit (SSE2) and 256-bit (AVX2/int256) integer vectors qualify.
33694 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
33695 (VT.is256BitVector() && Subtarget.hasInt256())))
33698 // Canonicalize AND to LHS.
33699 if (N1.getOpcode() == ISD::AND)
33702 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
33703 // ANDNP combine allows other combines to happen that prevent matching.
33704 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
33707 SDValue Mask = N1.getOperand(0);
33708 SDValue X = N1.getOperand(1);
// Find which AND operand is the shared mask; the other is Y.
33710 if (N0.getOperand(0) == Mask)
33711 Y = N0.getOperand(1);
33712 if (N0.getOperand(1) == Mask)
33713 Y = N0.getOperand(0);
33715 // Check to see if the mask appeared in both the AND and ANDNP.
33719 // Validate that X, Y, and Mask are bitcasts, and see through them.
33720 Mask = peekThroughBitcasts(Mask);
33721 X = peekThroughBitcasts(X);
33722 Y = peekThroughBitcasts(Y);
33724 EVT MaskVT = Mask.getValueType();
33725 unsigned EltBits = MaskVT.getScalarSizeInBits();
33727 // TODO: Attempt to handle floating point cases as well?
// The mask must be all-sign-bits per element (all-ones or all-zeros lanes),
// which ComputeNumSignBits == element width guarantees.
33728 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
33734 // (or (and (M, (sub 0, X)), (pandn M, X)))
33735 // which is a special case of vselect:
33736 // (vselect M, (sub 0, X), X)
33738 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
33739 // We know that, if fNegate is 0 or 1:
33740 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
33742 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
33743 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
33744 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
33745 // This lets us transform our vselect to:
33746 // (add (xor X, M), (and M, 1))
33748 // (sub (xor X, M), M)
33749 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
33750 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
// True when N is (sub 0, V) — i.e. the vector negation of V.
33751 auto IsNegV = [](SDNode *N, SDValue V) {
33752 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
33753 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
33756 if (IsNegV(Y.getNode(), X))
33758 else if (IsNegV(X.getNode(), Y))
33762 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
33763 SDValue SubOp2 = Mask;
33765 // If the negate was on the false side of the select, then
33766 // the operands of the SUB need to be swapped. PR 27251.
33767 // This is because the pattern being matched above is
33768 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
33769 // but if the pattern matched was
33770 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
33771 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
33772 // pattern also needs to be a negation of the replacement pattern above.
33773 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
33774 // sub accomplishes the negation of the replacement pattern.
33776 std::swap(SubOp1, SubOp2);
33778 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
33779 return DAG.getBitcast(VT, Res);
33783 // PBLENDVB is only available on SSE 4.1.
33784 if (!Subtarget.hasSSE41())
// PBLENDVB operates on byte vectors; pick the 256- or 128-bit byte type.
33787 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
33789 X = DAG.getBitcast(BlendVT, X);
33790 Y = DAG.getBitcast(BlendVT, Y);
33791 Mask = DAG.getBitcast(BlendVT, Mask);
// Note operand order: select(M, Y, X) because ANDNP selects X on zero mask.
33792 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
33793 return DAG.getBitcast(VT, Mask);
33796 // Helper function for combineOrCmpEqZeroToCtlzSrl
33800 // srl(ctlz x), log2(bitsize(x))
33801 // Input pattern is checked by caller.
// Lower a setcc(eq, cmp x, 0) node (Op) to srl(ctlz x, log2(bitwidth(x))):
// x == 0 iff ctlz(x) has its top bit set. The result is produced in i32 and
// then zext/truncated to ExtTy. Pattern validity is checked by the caller
// (combineOrCmpEqZeroToCtlzSrl).
// NOTE(review): `dl` is used below but its declaration (SDLoc) is on a line
// dropped from this extract. Also, the SRL shift-amount constant at 33813 is
// created with VT while the SRL node itself is MVT::i32 — confirm this
// matches the target's expected shift-amount type (upstream later uses a
// fixed narrow type here).
33802 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
33803 SelectionDAG &DAG) {
// Op is an X86ISD::SETCC; operand 1 is the X86ISD::CMP node.
33804 SDValue Cmp = Op.getOperand(1);
33805 EVT VT = Cmp.getOperand(0).getValueType();
33806 unsigned Log2b = Log2_32(VT.getSizeInBits());
33808 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
33809 // The result of the shift is true or false, and on X86, the 32-bit
33810 // encoding of shr and lzcnt is more desirable.
33811 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
33812 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
33813 DAG.getConstant(Log2b, dl, VT));
33814 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
33817 // Try to transform:
33818 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
33820 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
33821 // Will also attempt to match more generic cases, eg:
33822 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
33823 // Only applies if the target supports the FastLZCNT feature.
// Transform zext(or(setcc(eq, cmp x, 0), setcc(eq, cmp y, 0), ...)) into
// srl(or(ctlz(x), ctlz(y), ...), log2(bitsize)) when the target has fast
// LZCNT. Walks a chain of one-use OR nodes whose leaves are all
// setcc(eq, cmp _, 0), lowers each leaf via lowerX86CmpEqZeroToCtlzSrl, and
// rebuilds the OR chain bottom-up.
// NOTE(review): some lines (early returns, a `return SDValue();` on failed
// NewRHS, closing braces) are missing from this extract.
33824 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
33825 TargetLowering::DAGCombinerInfo &DCI,
33826 const X86Subtarget &Subtarget) {
33827 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
// An OR is combinable only if it has a single use, so rewriting it cannot
// duplicate work for other users.
33830 auto isORCandidate = [](SDValue N) {
33831 return (N->getOpcode() == ISD::OR && N->hasOneUse());
33834 // Check the zero extend is extending to 32-bit or more. The code generated by
33835 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
33836 // instructions to clear the upper bits.
33837 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
33838 !isORCandidate(N->getOperand(0)))
33841 // Check the node matches: setcc(eq, cmp 0)
33842 auto isSetCCCandidate = [](SDValue N) {
33843 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
33844 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
33845 N->getOperand(1).getOpcode() == X86ISD::CMP &&
33846 isNullConstant(N->getOperand(1).getOperand(1)) &&
33847 N->getOperand(1).getValueType().bitsGE(MVT::i32);
33850 SDNode *OR = N->getOperand(0).getNode();
33851 SDValue LHS = OR->getOperand(0);
33852 SDValue RHS = OR->getOperand(1);
33854 // Save nodes matching or(or, setcc(eq, cmp 0)).
// Descend the left-leaning OR chain, remembering intermediate ORs so they
// can be rebuilt after the leaves are lowered.
33855 SmallVector<SDNode *, 2> ORNodes;
33856 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
33857 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
33858 ORNodes.push_back(OR);
33859 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
33860 LHS = OR->getOperand(0);
33861 RHS = OR->getOperand(1);
33864 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
33865 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
33866 !isORCandidate(SDValue(OR, 0)))
33869 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
33871 // or(srl(ctlz),srl(ctlz)).
33872 // The dag combiner can then fold it into:
33873 // srl(or(ctlz, ctlz)).
33874 EVT VT = OR->getValueType(0);
33875 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
33876 SDValue Ret, NewRHS;
33877 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
33878 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
33883 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
// Unwind the saved OR chain innermost-first, OR-ing each lowered setcc leaf
// into the accumulated result.
33884 while (ORNodes.size() > 0) {
33885 OR = ORNodes.pop_back_val();
33886 LHS = OR->getOperand(0);
33887 RHS = OR->getOperand(1);
33888 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
33889 if (RHS->getOpcode() == ISD::OR)
33890 std::swap(LHS, RHS);
33891 EVT VT = OR->getValueType(0);
33892 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
33895 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
// Finally widen the i32-domain result back to N's original type.
33899 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
// DAG combine for ISD::OR on x86: SSE1-only FOR fallback, compare-equal
// folds, int->FP logic, PBLENDV blend matching, and finally SHLD/SHRD
// (double-shift) formation from (or (shl x, c), (srl y, bits-c)) patterns,
// including the XOR(c, bits-1) and ADD(y, y) disguises emitted by earlier
// combines.
// NOTE(review): non-contiguous original line numbers — several `return`
// statements, a `SDLoc DL(N);`, and some operands of the getNode calls
// around 33986-33997 are not visible in this extract.
33904 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
33905 TargetLowering::DAGCombinerInfo &DCI,
33906 const X86Subtarget &Subtarget) {
33907 SDValue N0 = N->getOperand(0);
33908 SDValue N1 = N->getOperand(1);
33909 EVT VT = N->getValueType(0);
33911 // If this is SSE1 only convert to FOR to avoid scalarization.
33912 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
33913 return DAG.getBitcast(MVT::v4i32,
33914 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
33915 DAG.getBitcast(MVT::v4f32, N0),
33916 DAG.getBitcast(MVT::v4f32, N1)));
33919 if (DCI.isBeforeLegalizeOps())
33922 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
33925 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33928 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
// SHLD/SHRD only exist for 16/32/64-bit operands.
33931 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
33934 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
33935 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
33937 // SHLD/SHRD instructions have lower register pressure, but on some
33938 // platforms they have higher latency than the equivalent
33939 // series of shifts/or that would otherwise be generated.
33940 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
33941 // have higher latencies and we are not optimizing for size.
33942 if (!OptForSize && Subtarget.isSHLDSlow())
// Canonicalize so N0 is the SHL and N1 the SRL (swap handled above).
33945 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
33947 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
33949 if (!N0.hasOneUse() || !N1.hasOneUse())
33952 SDValue ShAmt0 = N0.getOperand(1);
33953 if (ShAmt0.getValueType() != MVT::i8)
33955 SDValue ShAmt1 = N1.getOperand(1);
33956 if (ShAmt1.getValueType() != MVT::i8)
// Peek through truncates of the shift amounts.
33958 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
33959 ShAmt0 = ShAmt0.getOperand(0);
33960 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
33961 ShAmt1 = ShAmt1.getOperand(0);
33964 unsigned Opc = X86ISD::SHLD;
33965 SDValue Op0 = N0.getOperand(0);
33966 SDValue Op1 = N1.getOperand(0);
// If the SHL amount is the computed one (SUB/XOR form), this is really a
// SHRD with the operands reversed.
33967 if (ShAmt0.getOpcode() == ISD::SUB ||
33968 ShAmt0.getOpcode() == ISD::XOR) {
33969 Opc = X86ISD::SHRD;
33970 std::swap(Op0, Op1);
33971 std::swap(ShAmt0, ShAmt1);
33974 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
33975 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
33976 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
33977 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
33978 unsigned Bits = VT.getSizeInBits();
33979 if (ShAmt1.getOpcode() == ISD::SUB) {
33980 SDValue Sum = ShAmt1.getOperand(0);
33981 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
33982 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
33983 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
33984 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
// (Bits - C) on one side and C on the other: exact double-shift match.
33985 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
33986 return DAG.getNode(Opc, DL, VT,
33988 DAG.getNode(ISD::TRUNCATE, DL,
33991 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
33992 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
// Both amounts constant and summing to the bit width also matches.
33993 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
33994 return DAG.getNode(Opc, DL, VT,
33995 N0.getOperand(0), N1.getOperand(0),
33996 DAG.getNode(ISD::TRUNCATE, DL,
33998 } else if (ShAmt1.getOpcode() == ISD::XOR) {
// XOR(C, Bits-1) with a pre-shift by 1 is the legalized form of Bits-C.
33999 SDValue Mask = ShAmt1.getOperand(1);
34000 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
34001 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
34002 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
34003 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
34004 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
34005 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
34006 if (Op1.getOpcode() == InnerShift &&
34007 isa<ConstantSDNode>(Op1.getOperand(1)) &&
34008 Op1.getConstantOperandVal(1) == 1) {
34009 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
34010 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
34012 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
34013 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
34014 Op1.getOperand(0) == Op1.getOperand(1)) {
34015 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
34016 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
34025 /// Try to turn tests against the signbit in the form of:
34026 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
// Fold XOR(TRUNCATE(SRL(X, size(X)-1)), 1) — a negated sign-bit test — into
// a setcc (X > -1). Only fires for i8/i1 results, where the comparison form
// matches x86's SETcc lowering better than the shift+xor sequence.
// NOTE(review): early `return SDValue();` lines and the `SDLoc DL(N);`
// before 34065 are not visible in this extract.
34029 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
34030 // This is only worth doing if the output type is i8 or i1.
34031 EVT ResultType = N->getValueType(0);
34032 if (ResultType != MVT::i8 && ResultType != MVT::i1)
34035 SDValue N0 = N->getOperand(0);
34036 SDValue N1 = N->getOperand(1);
34038 // We should be performing an xor against a truncated shift.
34039 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
34042 // Make sure we are performing an xor against one.
34043 if (!isOneConstant(N1))
34046 // SetCC on x86 zero extends so only act on this if it's a logical shift.
34047 SDValue Shift = N0.getOperand(0);
34048 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
34051 // Make sure we are truncating from one of i16, i32 or i64.
34052 EVT ShiftTy = Shift.getValueType();
34053 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
34056 // Make sure the shift amount extracts the sign bit.
34057 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
34058 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
34061 // Create a greater-than comparison against -1.
34062 // N.B. Using SETGE against 0 works but we want a canonical looking
34063 // comparison, using SETGT matches up with what TranslateX86CC.
34065 SDValue ShiftOp = Shift.getOperand(0);
34066 EVT ShiftOpTy = ShiftOp.getValueType();
34067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34068 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
34069 *DAG.getContext(), ResultType);
34070 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
34071 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
// Widen the setcc result back to the xor's result type if they differ.
34072 if (SetCCResultType != ResultType)
34073 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
34077 /// Turn vector tests of the signbit in the form of:
34078 /// xor (sra X, elt_size(X)-1), -1
34082 /// This should be called before type legalization because the pattern may not
34083 /// persist after that.
// Fold xor(sra X, elt_size-1), -1 — a vector "not sign bit" test — into
// PCMPGT(X, -1). Runs before type legalization; only the listed simple
// vector types with the required subtarget feature are handled.
// NOTE(review): some case labels and early returns are missing from this
// extract (the visible switch skips original lines 34092-34093 and
// 34096-34098).
34084 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
34085 const X86Subtarget &Subtarget) {
34086 EVT VT = N->getValueType(0);
34087 if (!VT.isSimple())
34090 switch (VT.getSimpleVT().SimpleTy) {
34091 default: return SDValue();
34094 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
// v2i64 PCMPGT (PCMPGTQ) requires SSE4.2.
34095 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
34099 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
34102 // There must be a shift right algebraic before the xor, and the xor must be a
34103 // 'not' operation.
34104 SDValue Shift = N->getOperand(0);
34105 SDValue Ones = N->getOperand(1);
34106 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
34107 !ISD::isBuildVectorAllOnes(Ones.getNode()))
34110 // The shift should be smearing the sign bit across each vector element.
34111 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
34115 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
34116 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
34117 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
34120 // Create a greater-than comparison against -1. We don't use the more obvious
34121 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
34122 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
34125 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
34126 /// is valid for the given \p Subtarget.
// Returns true if a saturating truncation from SrcVT to DstVT can be done
// with the AVX-512 VPMOV*S/US family on this subtarget: vector source of at
// most 512 bits, source elements 16-64 bits, destination elements 8-32
// bits, and either a 512-bit source or VLX for the narrower widths (with
// BWI needed for sub-32-bit source elements).
// NOTE(review): the `return false;` lines between the visible checks were
// dropped by extraction.
34127 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
34128 const X86Subtarget &Subtarget) {
34129 if (!Subtarget.hasAVX512())
34132 // FIXME: Scalar type may be supported if we move it to vector register.
34133 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
34136 EVT SrcElVT = SrcVT.getScalarType();
34137 EVT DstElVT = DstVT.getScalarType();
34138 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
34140 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
34142 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
34143 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
34147 /// Detect a pattern of truncation with saturation:
34148 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
34149 /// Return the source value to be truncated or SDValue() if the pattern was not
// Detect (truncate (umin x, UMAX_of_VT)) — unsigned saturation before a
// truncate to VT. Returns the value to truncate (the umin's LHS) on a
// match, otherwise SDValue(); the caller performs the actual truncation.
// NOTE(review): the declaration of `C` (an APInt splat receiver for
// isConstantSplatVector) and the final returns sit on lines dropped from
// this extract.
34151 static SDValue detectUSatPattern(SDValue In, EVT VT) {
34152 if (In.getOpcode() != ISD::UMIN)
34155 //Saturation with truncation. We truncate from InVT to VT.
34156 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
34157 "Unexpected types for truncate operation")
34160 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
34161 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
34162 // the element size of the destination type.
// isMask(N) == "low N bits set, rest clear", i.e. exactly UMAX of DstVT.
34163 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
34169 /// Detect a pattern of truncation with saturation:
34170 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
34171 /// The types should allow to use VPMOVUS* instruction on AVX512.
34172 /// Return the source value to be truncated or SDValue() if the pattern was not
// AVX-512-gated wrapper around detectUSatPattern: only report a saturating
// truncation source when the type pair is lowerable via VPMOVUS* on this
// subtarget.
34174 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
34175 const X86Subtarget &Subtarget) {
34176 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
34178 return detectUSatPattern(In, VT);
// Combine a truncate whose input matches the unsigned-saturation pattern
// into X86ISD::VTRUNCUS when both types are legal and the AVX-512 check
// passes.
// NOTE(review): the extract dropped the line carrying this function's
// leading `static SDValue` tokens (original line 34181) and its trailing
// `return SDValue();` — the visible text starts mid-signature.
34182 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
34183 const X86Subtarget &Subtarget) {
34184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34185 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
34187 if (auto USatVal = detectUSatPattern(In, VT))
34188 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
34189 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
34193 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
34194 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
34195 /// X86ISD::AVG instruction.
// Detect the rounding-average idiom c = (a + b + 1) >> 1 on unsigned
// i8/i16 vectors (computed in a wider intermediate type) and replace it
// with X86ISD::AVG (PAVGB/PAVGW), splitting oversized vectors down to the
// subtarget's legal width first.
// NOTE(review): non-contiguous original lines — a trailing DL parameter on
// the signature, several early returns, and some closing braces of the
// lambdas/loops are not visible in this extract.
34196 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
34197 const X86Subtarget &Subtarget,
34199 if (!VT.isVector() || !VT.isSimple())
34201 EVT InVT = In.getValueType();
34202 unsigned NumElems = VT.getVectorNumElements();
34204 EVT ScalarVT = VT.getVectorElementType();
// PAVG only exists for byte/word elements; power-of-2 count keeps the
// splitting logic below exact.
34205 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
34206 isPowerOf2_32(NumElems)))
34209 // InScalarVT is the intermediate type in AVG pattern and it should be greater
34210 // than the original input type (i8/i16).
34211 EVT InScalarVT = InVT.getVectorElementType();
34212 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
34215 if (!Subtarget.hasSSE2())
34218 // Detect the following pattern:
34220 // %1 = zext <N x i8> %a to <N x i32>
34221 // %2 = zext <N x i8> %b to <N x i32>
34222 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
34223 // %4 = add nuw nsw <N x i32> %3, %2
34224 // %5 = lshr <N x i32> %N, <i32 1 x N>
34225 // %6 = trunc <N x i32> %5 to <N x i8>
34227 // In AVX512, the last instruction can also be a trunc store.
34228 if (In.getOpcode() != ISD::SRL)
34231 // A lambda checking the given SDValue is a constant vector and each element
34232 // is in the range [Min, Max].
34233 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
34234 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
34235 if (!BV || !BV->isConstant())
34237 for (SDValue Op : V->ops()) {
34238 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
34241 uint64_t Val = C->getZExtValue();
34242 if (Val < Min || Val > Max)
34248 // Split vectors to legal target size and apply AVG.
// Per-feature native widths: BWI=512, AVX2=256, else 128 bits; wider
// vectors are chopped into NumSubs equal pieces and concatenated back.
34249 auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
34250 unsigned NumSubs = 1;
34251 if (Subtarget.hasBWI()) {
34252 if (VT.getSizeInBits() > 512)
34253 NumSubs = VT.getSizeInBits() / 512;
34254 } else if (Subtarget.hasAVX2()) {
34255 if (VT.getSizeInBits() > 256)
34256 NumSubs = VT.getSizeInBits() / 256;
34258 if (VT.getSizeInBits() > 128)
34259 NumSubs = VT.getSizeInBits() / 128;
34263 return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);
34265 SmallVector<SDValue, 4> Subs;
34266 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
34267 VT.getVectorNumElements() / NumSubs);
34268 for (unsigned i = 0; i != NumSubs; ++i) {
34269 unsigned Idx = i * SubVT.getVectorNumElements();
34270 SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
34271 SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
34272 Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
34274 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
34277 // Check if each element of the vector is left-shifted by one.
34278 auto LHS = In.getOperand(0);
34279 auto RHS = In.getOperand(1);
// The SRL amount must be a splat of exactly 1.
34280 if (!IsConstVectorInRange(RHS, 1, 1))
34282 if (LHS.getOpcode() != ISD::ADD)
34285 // Detect a pattern of a + b + 1 where the order doesn't matter.
34286 SDValue Operands[3];
34287 Operands[0] = LHS.getOperand(0);
34288 Operands[1] = LHS.getOperand(1);
34290 // Take care of the case when one of the operands is a constant vector whose
34291 // element is in the range [1, 256].
34292 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
34293 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
34294 Operands[0].getOperand(0).getValueType() == VT) {
34295 // The pattern is detected. Subtract one from the constant vector, then
34296 // demote it and emit X86ISD::AVG instruction.
34297 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
34298 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
34299 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
34300 return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
// General case: find the nested ADD among the two operands.
34303 if (Operands[0].getOpcode() == ISD::ADD)
34304 std::swap(Operands[0], Operands[1]);
34305 else if (Operands[1].getOpcode() != ISD::ADD)
34307 Operands[2] = Operands[1].getOperand(0);
34308 Operands[1] = Operands[1].getOperand(1);
34310 // Now we have three operands of two additions. Check that one of them is a
34311 // constant vector with ones, and the other two are promoted from i8/i16.
34312 for (int i = 0; i < 3; ++i) {
34313 if (!IsConstVectorInRange(Operands[i], 1, 1))
// Move the all-ones constant into slot 2 so slots 0/1 hold the operands.
34315 std::swap(Operands[i], Operands[2]);
34317 // Check if Operands[0] and Operands[1] are results of type promotion.
34318 for (int j = 0; j < 2; ++j)
34319 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
34320 Operands[j].getOperand(0).getValueType() != VT)
34323 // The pattern is detected, emit X86ISD::AVG instruction.
34324 return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
// DAG combine for loads: split a 256-bit non-extending load into two
// 128-bit halves when the subtarget's unaligned 32-byte loads are slow, or
// when the load is non-temporal on a pre-AVX2 target (where a 32-byte NT
// load would lower to a regular temporal load anyway).
// NOTE(review): declarations of `Fast` (bool out-param for
// allowsMemoryAccess), `dl`, `Load1`/`Load2`, and the HalfVT element-count
// argument sit on lines dropped from this extract.
34330 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
34331 TargetLowering::DAGCombinerInfo &DCI,
34332 const X86Subtarget &Subtarget) {
34333 LoadSDNode *Ld = cast<LoadSDNode>(N);
34334 EVT RegVT = Ld->getValueType(0);
34335 EVT MemVT = Ld->getMemoryVT();
34337 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34339 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
34340 // into two 16-byte operations. Also split non-temporal aligned loads on
34341 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
34342 ISD::LoadExtType Ext = Ld->getExtensionType();
34344 unsigned AddressSpace = Ld->getAddressSpace();
34345 unsigned Alignment = Ld->getAlignment();
34346 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
34347 Ext == ISD::NON_EXTLOAD &&
34348 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
34349 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
34350 AddressSpace, Alignment, &Fast) && !Fast))) {
34351 unsigned NumElems = RegVT.getVectorNumElements();
34355 SDValue Ptr = Ld->getBasePtr();
34357 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
// Low half: original pointer/alignment.
34360 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
34361 Alignment, Ld->getMemOperand()->getFlags());
// High half: advance by 16 bytes; alignment capped accordingly.
34363 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
34365 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
34366 Ld->getPointerInfo().getWithOffset(16),
34367 MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
// Join the two load chains so ordering with other memory ops is preserved.
34368 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34370 Load2.getValue(1));
34372 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
34373 return DCI.CombineTo(N, NewVec, TF, true);
34379 /// If V is a build vector of boolean constants and exactly one of those
34380 /// constants is true, return the operand index of that true element.
34381 /// Otherwise, return -1.
// If V is a build vector of i1 constants with exactly one true element,
// return that element's index; otherwise return -1. Used to reduce masked
// load/store with a one-hot mask to a scalar memory access.
// NOTE(review): the loop's undef-skip/early-return lines and the final
// `return TrueIndex;` are on lines dropped from this extract.
34382 static int getOneTrueElt(SDValue V) {
34383 // This needs to be a build vector of booleans.
34384 // TODO: Checking for the i1 type matches the IR definition for the mask,
34385 // but the mask check could be loosened to i8 or other types. That might
34386 // also require checking more than 'allOnesValue'; eg, the x86 HW
34387 // instructions only require that the MSB is set for each mask element.
34388 // The ISD::MSTORE comments/definition do not specify how the mask operand
34390 auto *BV = dyn_cast<BuildVectorSDNode>(V);
34391 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
34394 int TrueIndex = -1;
34395 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
34396 for (unsigned i = 0; i < NumElts; ++i) {
34397 const SDValue &Op = BV->getOperand(i);
34400 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
34403 if (ConstNode->getAPIntValue().isAllOnesValue()) {
34404 // If we already found a one, this is too many.
34405 if (TrueIndex >= 0)
34413 /// Given a masked memory load/store operation, return true if it has one mask
34414 /// bit set. If it has one mask bit set, then also return the memory address of
34415 /// the scalar element to load/store, the vector index to insert/extract that
34416 /// scalar element, and the alignment for the scalar memory access.
// For a masked load/store whose mask has exactly one true bit, compute the
// scalar element's address (base + elt index * store size), the vector
// insert/extract index, and the scalar access alignment. Returns false if
// the mask is not one-hot.
// NOTE(review): the `return false;` after the getOneTrueElt check and the
// final `return true;` were dropped by extraction.
34417 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
34418 SelectionDAG &DAG, SDValue &Addr,
34419 SDValue &Index, unsigned &Alignment) {
34420 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
34421 if (TrueMaskElt < 0)
34424 // Get the address of the one scalar element that is specified by the mask
34425 // using the appropriate offset from the base pointer.
34426 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
34427 Addr = MaskedOp->getBasePtr();
34428 if (TrueMaskElt != 0) {
34429 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
34430 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
34433 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
// Scalar alignment cannot exceed the element's own store size.
34434 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
34438 /// If exactly one element of the mask is set for a non-extending masked load,
34439 /// it is a scalar load and vector insert.
34440 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34441 /// mask have already been optimized in IR, so we don't bother with those here.
// If a non-extending masked load has a one-hot mask, replace it with a
// scalar load of that single element plus an INSERT_VECTOR_ELT into the
// pass-through (Src0) vector.
// NOTE(review): the extract dropped the line with this function's leading
// `static SDValue` tokens (original 34442), the `SDLoc DL(ML);`, and the
// `SDValue Load =` binding at 34460.
34443 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34444 TargetLowering::DAGCombinerInfo &DCI) {
34445 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34446 // However, some target hooks may need to be added to know when the transform
34447 // is profitable. Endianness would also have to be considered.
34449 SDValue Addr, VecIndex;
34450 unsigned Alignment;
34451 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
34454 // Load the one scalar element that is specified by the mask using the
34455 // appropriate offset from the base pointer.
34457 EVT VT = ML->getValueType(0);
34458 EVT EltVT = VT.getVectorElementType();
34460 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
34461 Alignment, ML->getMemOperand()->getFlags());
34463 // Insert the loaded element into the appropriate place in the vector.
34464 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
// CombineTo with the scalar load's chain keeps memory ordering intact.
34466 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
// For a masked load with a constant (build-vector) mask: if both the first
// and last lanes are loaded, the whole aligned range is touched, so replace
// with a full vector load + select; otherwise replace with a masked load
// whose pass-through is undef plus an explicit select against the original
// pass-through, enabling cheaper blend instructions.
// NOTE(review): the line with this function's leading `static SDValue`
// tokens and the `SDLoc DL(ML);` were dropped by extraction.
34470 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34471 TargetLowering::DAGCombinerInfo &DCI) {
34472 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
34476 EVT VT = ML->getValueType(0);
34478 // If we are loading the first and last elements of a vector, it is safe and
34479 // always faster to load the whole vector. Replace the masked load with a
34480 // vector load and select.
34481 unsigned NumElts = VT.getVectorNumElements();
34482 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
34483 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
34484 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
34485 if (LoadFirstElt && LoadLastElt) {
34486 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34487 ML->getMemOperand());
34488 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
34489 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
34492 // Convert a masked load with a constant mask into a masked load and a select.
34493 // This allows the select operation to use a faster kind of select instruction
34494 // (for example, vblendvps -> vblendps).
34496 // Don't try this if the pass-through operand is already undefined. That would
34497 // cause an infinite loop because that's what we're about to create.
34498 if (ML->getSrc0().isUndef())
34501 // The new masked load has an undef pass-through operand. The select uses the
34502 // original pass-through operand.
34503 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34504 ML->getMask(), DAG.getUNDEF(VT),
34505 ML->getMemoryVT(), ML->getMemOperand(),
34506 ML->getExtensionType());
34507 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
34509 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
/// DAG combine for ISD::MLOAD (masked vector load).
/// Non-extending loads are reduced to a scalar load or (pre-AVX512) to a
/// plain load + select; sign-extending masked loads are re-emitted as a
/// masked load of the narrow memory type followed by a VSEXT.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))

    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))

  // Everything below handles only sign-extending masked loads.
  if (Mld->getExtensionType() != ISD::SEXTLOAD)

  // Resolve extending loads.
  EVT VT = Mld->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();

  assert(LdVT != VT && "Cannot extend to the same type");
  unsigned ToSz = VT.getScalarSizeInBits();
  unsigned FromSz = LdVT.getScalarSizeInBits();
  // From/To sizes and ElemCount must be pow of two.
  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");

  unsigned SizeRatio = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle: same total width as VT but
  // with NumElems*SizeRatio elements of the narrow memory scalar type.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
          LdVT.getScalarType(), NumElems*SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Convert Src0 value (the pass-through operand): spread its NumElems narrow
  // elements so each lands where the widened load expects it.
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
  if (!Mld->getSrc0().isUndef()) {
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);

  // Prepare the new mask.
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type: widen it by shuffling the
    // original mask lanes into position and zero-filling the rest.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    // Tail lanes select from the all-zeros operand (index >= NumElems*ratio).
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
    // Otherwise the mask is a vXi1 predicate: widen by concatenating with
    // zero (all-false) predicate vectors.
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);

  // Emit the non-extending masked load of the narrow type, then sign-extend
  // in-register to VT. CombineTo also forwards the load's chain result.
  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
/// Returns the replacement scalar store, or an empty SDValue on failure.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                              SelectionDAG &DAG) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  // Bail out unless the mask has exactly one set element; on success the
  // helper fills in the scalar address, the vector lane index and alignment.
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))

  // Extract the one scalar element that is actually being stored.
  EVT VT = MS->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                MS->getValue(), VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
                      Alignment, MS->getMemOperand()->getFlags());
/// DAG combine for ISD::MSTORE (masked vector store).
/// Non-truncating stores may be reduced to a scalar store or have a
/// PCMPGT-against-zero mask simplified to its sign-bit source; truncating
/// stores are re-emitted as a masked store of the narrow memory type.
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

  // Compressing stores are not handled by the transforms below.
  if (Mst->isCompressingStore())

  if (!Mst->isTruncatingStore()) {
    if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
      return ScalarStore;

    // If the mask is checking (0 > X), we're creating a vector with all-zeros
    // or all-ones elements based on the sign bits of X. AVX1 masked store only
    // cares about the sign bit of each mask element, so eliminate the compare:
    // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
    // Note that by waiting to match an x86-specific PCMPGT node, we're
    // eliminating potentially more complex matching of a setcc node which has
    // a full range of predicates.
    SDValue Mask = Mst->getMask();
    if (Mask.getOpcode() == X86ISD::PCMPGT &&
        ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
      assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
             "Unexpected type for PCMPGT");
      return DAG.getMaskedStore(
          Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
          Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());

    // TODO: AVX512 targets should also be able to simplify something like the
    // pattern above, but that pattern will be different. It will either need to
    // match setcc more generally or match PCMPGTM later (in tablegen?).

  // Resolve truncating stores.
  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getScalarSizeInBits();
  unsigned ToSz = StVT.getScalarSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The truncating store is legal in some cases. For example
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
  // are designated for truncate store.
  // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))

  // From/To sizes and ElemCount must be pow of two.
  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
    "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert (((NumElems * FromSz) % ToSz) == 0 &&
          "Unexpected ratio for truncating masked store");

  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle: same total width as VT but
  // with NumElems*SizeRatio elements of the narrow store scalar type.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
          StVT.getScalarType(), NumElems*SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Pack the truncated elements into the low lanes of the wide vector.
  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
         "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),

  // Widen the mask to match the wide vector type.
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type: shuffle the mask lanes into
    // position and fill the remaining lanes from the all-zeros operand.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
      ShuffleVec[i] = NumElems*SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
    // Otherwise the mask is a vXi1 predicate: widen by concatenating with
    // zero (all-false) predicate vectors.
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);

  // Emit a non-truncating masked store of the pre-packed value (final 'false'
  // is the IsTruncating flag).
  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
                            Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
/// DAG combine for ISD::STORE. Handles, in order:
///  - splitting slow unaligned 32-byte stores into two 16-byte stores;
///  - truncating vector stores (AVG detection, AVX512 unsigned-saturation
///    detection, and a generic shuffle+store lowering);
///  - turning 64-bit load/store pairs into GPR or f64 load/stores;
///  - scalarizing an i64 store of an extracted vector element on 32-bit
///    targets with SSE2 by bitcasting to f64.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();

  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  unsigned AddressSpace = St->getAddressSpace();
  unsigned Alignment = St->getAlignment();
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
    unsigned NumElems = VT.getVectorNumElements();

    // Split the 256-bit value into its two 128-bit halves.
    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                     Alignment, St->getMemOperand()->getFlags());
        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                     St->getPointerInfo().getWithOffset(16),
                     MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
    // Join the two store chains.
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of X86ISD::AVG
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

        detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);

    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getScalarSizeInBits();
    unsigned ToSz = StVT.getScalarSizeInBits();

    // The truncating store is legal in some cases. For example
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
    // are designated for truncate store.
    // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

    unsigned SizeRatio = FromSz / ToSz;

    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
            StVT.getScalarType(), NumElems*SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    // Pack the truncated elements into the low lanes of the wide vector.
    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)

    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory.
    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i, dl));
          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
                       St->getAlignment(), St->getMemOperand()->getFlags());
      // Advance the pointer by one store unit for the next chunk.
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
      Chains.push_back(Ch);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)

  const Function &F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
    SmallVector<SDValue, 8> Ops;

    // Only plain (non-extending, non-indexed) loads qualify.
    if (!ISD::isNormalLoad(Ld))

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))

    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    if (Subtarget.is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getMemOperand());

      // Make sure new load is placed in same chain order.
      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
                          St->getMemOperand());

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(), Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               MinAlign(Ld->getAlignment(), 4),
                               Ld->getMemOperand()->getFlags());
    // Make sure new loads are placed in same chain order.
    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

        DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
                                St->getPointerInfo().getWithOffset(4),
                                MinAlign(St->getAlignment(), 4),
                                St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    // Reinterpret the source vector as f64 lanes and redo the extract there.
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
///   A = < float a0, float a1, float a2, float a3 >
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
/// \p IsCommutative permits the per-pair operand order to be swapped.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)

  MVT VT = LHS.getSimpleValueType();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  // Half of each lane's results come from A-pairs, half from B-pairs.
  unsigned HalfLaneElts = NumLaneElts/2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!LHS.getOperand(0).isUndef())
      A = LHS.getOperand(0);
    if (!LHS.getOperand(1).isUndef())
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
    // Non-shuffle LHS: treat it as the identity shuffle of itself.
    if (!LHS.isUndef())
    for (unsigned i = 0; i != NumElts; ++i)

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!RHS.getOperand(0).isUndef())
      C = RHS.getOperand(0);
    if (!RHS.getOperand(1).isUndef())
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
    // Non-shuffle RHS: treat it as the identity shuffle of itself.
    if (!RHS.isUndef())
    for (unsigned i = 0; i != NumElts; ++i)

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
    ShuffleVectorSDNode::commuteMask(RMask);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i+l], RIdx = RMask[i+l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
/// Do target-specific dag combines on floating-point adds/subs.
/// Currently: fold (fadd/fsub of matching shuffles) into FHADD/FHSUB when the
/// subtarget supports the horizontal instructions for this vector type.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  bool IsFadd = N->getOpcode() == ISD::FADD;
  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
  // SSE3 covers the 128-bit types, AVX (Fp256) the 256-bit ones.
  // FADD is commutative, so per-pair operand swaps are allowed for it.
  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, IsFadd)) {
    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
/// Only fires when the truncation is the sole user of the binop and (for most
/// opcodes) when at least one input truncation is free or constant-foldable.
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
  SDValue Src = N->getOperand(0);
  unsigned Opcode = Src.getOpcode();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  // True if truncating both operands costs no more than the one output
  // truncation we are removing.
  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

    // Repeated operand, so we are only trading one output truncation for
    // one input truncation.

    // See if either operand has been extended from a smaller/equal size to
    // the truncation size, allowing a truncation to combine with the extend.
    unsigned Opcode0 = Op0.getOpcode();
    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
         Opcode0 == ISD::ZERO_EXTEND) &&
        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)

    unsigned Opcode1 = Op1.getOpcode();
    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
         Opcode1 == ISD::ZERO_EXTEND) &&
        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)

    // See if either operand is a single use constant which can be constant
    // folded.
    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());

  // Builds BINOP(TRUNC(N0), TRUNC(N1)) in the narrow type VT.
  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);

  // Don't combine if the operation has other uses.
  if (!N->isOnlyUserOf(Src.getNode()))

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
  if (!VT.isVector())

  // In most cases its only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);

    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
    // better to truncate if we have the chance.
    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
        !Subtarget.hasDQI())
      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));

  // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
/// \p Regs holds the 128-bit input sub-vectors; they are masked to the output
/// element width first so PACKUS's unsigned saturation cannot clamp them.
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
                             Regs[0].getValueType() == MVT::v2i64));
  EVT OutVT = N->getValueType(0);
  EVT OutSVT = OutVT.getVectorElementType();
  EVT InVT = Regs[0].getValueType();
  EVT InSVT = InVT.getVectorElementType();

  // First, use mask to unset all bits that won't appear in the result.
  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
         "OutSVT can only be either i8 or i16.");
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
  for (auto &Reg : Regs)
    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

  // Types used by each halving PACKUS step below.
  MVT UnpackedVT, PackedVT;
  if (OutSVT == MVT::i8) {
    UnpackedVT = MVT::v8i16;
    PackedVT = MVT::v16i8;
    UnpackedVT = MVT::v4i32;
    PackedVT = MVT::v8i16;

  // In each iteration, truncate the type by a half size.
  auto RegNum = Regs.size();
  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
       j < e; j *= 2, RegNum /= 2) {
    for (unsigned i = 0; i < RegNum; i++)
      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
    // Each PACKUS merges two registers into one, halving the count.
    for (unsigned i = 0; i < RegNum / 2; i++)
      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],

  // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
  // then extract a subvector as the result since v8i8 is not a legal type.
  if (OutVT == MVT::v8i8) {
    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
                          DAG.getIntPtrConstant(0, DL));
  } else if (RegNum > 1) {
    // Multiple result registers remain: concatenate them into OutVT.
    Regs.resize(RegNum);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
/// Each input is sign-extended-in-register from 16 bits (shl 16 + sra 16) so
/// PACKSS's signed saturation reproduces an exact truncation.
combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
  EVT OutVT = N->getValueType(0);

  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
  for (auto &Reg : Regs) {
    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,

  // Pack pairs of v4i32 registers into v8i16, halving the register count.
  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],

  // Multiple result registers remain: concatenate them into OutVT.
  if (Regs.size() > 2) {
    Regs.resize(Regs.size() / 2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
/// Only used for SSE2..AVX (not AVX2/AVX512) targets — see checks below.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())

  // Only handle i32/i64 -> i8/i16 truncations with pow-of-two element counts.
  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&

  // SSSE3's pshufb results in less instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))

  // Split a long vector into vectors of legal type.
  unsigned RegNum = InVT.getSizeInBits() / 128;
  SmallVector<SDValue, 8> SubVec(RegNum);
  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

  for (unsigned i = 0; i < RegNum; i++)
    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
  else if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
35363 /// This function transforms vector truncation of 'extended sign-bits' or
35364 /// 'extended zero-bits' values.
35365 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
35366 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
35368 const X86Subtarget &Subtarget) {
35369 // Requires SSE2 but AVX512 has fast truncate.
35370 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
// Only simple vector result/input types are handled.
35373 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
35376 SDValue In = N->getOperand(0);
35377 if (!In.getValueType().isSimple())
35380 MVT VT = N->getValueType(0).getSimpleVT();
35381 MVT SVT = VT.getScalarType();
35383 MVT InVT = In.getValueType().getSimpleVT();
35384 MVT InSVT = InVT.getScalarType();
35386 // Check we have a truncation suited for PACKSS.
35387 if (!VT.is128BitVector() && !VT.is256BitVector())
35389 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
35391 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
35394 // Use PACKSS if the input has sign-bits that extend all the way to the
35395 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
35396 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
// A single PACK step packs to at most 16-bit elements, hence the cap at 16.
35397 unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
35398 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
35399 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
35401 // Use PACKUS if the input has zero-bits that extend all the way to the
35402 // packed/truncated value. e.g. masks, zext_in_reg, etc.
35404 DAG.computeKnownBits(In, Known);
35405 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
// Pre-SSE41 there is no PACKUSDW, so only 8-bit PACKUS is available.
35406 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
35407 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
35408 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
/// Main DAGCombine entry point for ISD::TRUNCATE: tries several specialized
/// folds in priority order before falling back to the generic vector split.
35413 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
35414 const X86Subtarget &Subtarget) {
35415 EVT VT = N->getValueType(0);
35416 SDValue Src = N->getOperand(0);
35419 // Attempt to pre-truncate inputs to arithmetic ops instead.
35420 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
35423 // Try to detect AVG pattern first.
35424 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
35427 // Try to combine truncation with unsigned saturation.
35428 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
35431 // The bitcast source is a direct mmx result.
35432 // Detect bitcasts between i32 to x86mmx
35433 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
35434 SDValue BCSrc = Src.getOperand(0);
35435 if (BCSrc.getValueType() == MVT::x86mmx)
35436 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
35439 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
35440 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
35443 return combineVectorTruncation(N, DAG, Subtarget);
35446 /// Returns the negated value if the node \p N flips sign of FP value.
35448 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
35449 /// AVX512F does not have FXOR, so FNEG is lowered as
35450 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
35451 /// In this case we go though all bitcasts.
35452 static SDValue isFNEG(SDNode *N) {
// Direct FNEG: the negated value is simply its operand.
35453 if (N->getOpcode() == ISD::FNEG)
35454 return N->getOperand(0);
// Otherwise look for an (F)XOR with a sign-mask constant, possibly hidden
// behind bitcasts (the AVX512F lowering form described above).
35456 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
35457 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
35460 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
35461 if (!Op1.getValueType().isFloatingPoint())
35464 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
35466 unsigned EltBits = Op1.getScalarValueSizeInBits();
// True iff the FP constant's bit pattern is exactly the IEEE sign bit
// for this element width.
35467 auto isSignMask = [&](const ConstantFP *C) {
35468 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
35471 // There is more than one way to represent the same constant on
35472 // the different X86 targets. The type of the node may also depend on size.
35473 // - load scalar value and broadcast
35474 // - BUILD_VECTOR node
35475 // - load from a constant pool.
35476 // We check all variants here.
35477 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
35478 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
35479 if (isSignMask(cast<ConstantFP>(C)))
35482 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
35483 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
35484 if (isSignMask(CN->getConstantFPValue()))
35487 } else if (auto *C = getTargetConstantFromNode(Op1)) {
35488 if (C->getType()->isVectorTy()) {
35489 if (auto *SplatV = C->getSplatValue())
35490 if (isSignMask(cast<ConstantFP>(SplatV)))
35492 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
35493 if (isSignMask(FPConst))
35499 /// Do target-specific dag combines on floating point negations.
35500 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
35501 const X86Subtarget &Subtarget) {
35502 EVT OrigVT = N->getValueType(0);
// isFNEG strips the negation (FNEG or xor-with-sign-mask) and returns the
// value being negated.
35503 SDValue Arg = isFNEG(N);
35504 assert(Arg.getNode() && "N is expected to be an FNEG node");
35506 EVT VT = Arg.getValueType();
35507 EVT SVT = VT.getScalarType();
35510 // Let legalize expand this if it isn't a legal type yet.
35511 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
35514 // If we're negating a FMUL node on a target with FMA, then we can avoid the
35515 // use of a constant by performing (-0 - A*B) instead.
35516 // FIXME: Check rounding control flags as well once it becomes available.
35517 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
35518 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
35519 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
35520 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
35521 Arg.getOperand(1), Zero);
35522 return DAG.getBitcast(OrigVT, NewNode);
35525 // If we're negating an FMA node, then we can adjust the
35526 // instruction to include the extra negation.
35527 unsigned NewOpcode = 0;
35528 if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
// Each FMA variant maps to the variant with the sign of the result flipped;
// the *_RND forms carry an extra rounding-mode operand but map the same way.
35529 switch (Arg.getOpcode()) {
35530 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
35531 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
35532 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
35533 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
35534 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
35535 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
35536 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
35537 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
35538 // We can't handle scalar intrinsic node here because it would only
35539 // invert one element and not the whole vector. But we could try to handle
35540 // a negation of the lower element only.
35544 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
35545 Arg.getNode()->ops()));
/// Rewrite an X86 FP logic op (FOR/FXOR/FAND/FANDN) as the equivalent
/// integer logic op on bitcast operands when integer vector types are
/// available; integer logic ops are easier to combine further.
35550 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
35551 const X86Subtarget &Subtarget) {
35552 MVT VT = N->getSimpleValueType(0);
35553 // If we have integer vector types available, use the integer opcodes.
35554 if (VT.isVector() && Subtarget.hasSSE2()) {
// Reinterpret the operands as a vector of i64 of the same total width.
35557 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
35559 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
35560 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
35561 unsigned IntOpcode;
35562 switch (N->getOpcode()) {
35563 default: llvm_unreachable("Unexpected FP logic op");
35564 case X86ISD::FOR: IntOpcode = ISD::OR; break;
35565 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
35566 case X86ISD::FAND: IntOpcode = ISD::AND; break;
35567 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
35569 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
35570 return DAG.getBitcast(VT, IntOp);
35576 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
35577 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
35578 if (N->getOpcode() != ISD::XOR)
// Match xor(X86ISD::SETCC, 1): xor-by-1 flips the i1 result, which is the
// same as inverting the condition code of the SETCC itself.
35581 SDValue LHS = N->getOperand(0);
35582 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
35583 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
// Operand 0 of X86ISD::SETCC is the condition code; invert it.
35586 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
35587 X86::CondCode(LHS->getConstantOperandVal(0)));
35589 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
/// Combine ISD::XOR: SSE1-only v4i32 xor -> FXOR, xor-of-setcc inversion,
/// shift/trunc-into-cmp folds, int->FP logic conversion, and FNEG patterns.
35592 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
35593 TargetLowering::DAGCombinerInfo &DCI,
35594 const X86Subtarget &Subtarget) {
35595 // If this is SSE1 only convert to FXOR to avoid scalarization.
35596 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
35597 N->getValueType(0) == MVT::v4i32) {
35598 return DAG.getBitcast(
35599 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
35600 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
35601 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
35604 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
// Remaining folds are only safe once legalization of operations has begun.
35607 if (DCI.isBeforeLegalizeOps())
35610 if (SDValue SetCC = foldXor1SetCC(N, DAG))
35613 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
35616 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
35620 return combineFneg(N, DAG, Subtarget);
/// Return true if V is a scalar FP zero constant or an all-zeros build
/// vector (potentially containing undef elements).
35625 static bool isNullFPScalarOrVectorConst(SDValue V) {
35626 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
35629 /// If a value is a scalar FP zero or a vector FP zero (potentially including
35630 /// undefined elements), return a zero constant that may be used to fold away
35631 /// that value. In the case of a vector, the returned constant will not contain
35632 /// undefined elements even if the input parameter does. This makes it suitable
35633 /// to be used as a replacement operand with operations (eg, bitwise-and) where
35634 /// an undef should not propagate.
35635 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
35636 const X86Subtarget &Subtarget) {
35637 if (!isNullFPScalarOrVectorConst(V))
// For vectors, build a fresh all-zeros vector so no undef lanes leak through.
35640 if (V.getValueType().isVector())
35641 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
/// Fold FAND with an fxor-by-all-ones (i.e. bitwise-not) operand into FANDN.
35646 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
35647 const X86Subtarget &Subtarget) {
35648 SDValue N0 = N->getOperand(0);
35649 SDValue N1 = N->getOperand(1);
35650 EVT VT = N->getValueType(0);
35653 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
35654 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
35655 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
35656 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
// All-ones test that works for both scalar FP constants and build vectors.
35659 auto isAllOnesConstantFP = [](SDValue V) {
35660 if (V.getSimpleValueType().isVector())
35661 return ISD::isBuildVectorAllOnes(V.getNode());
35662 auto *C = dyn_cast<ConstantFPSDNode>(V);
35663 return C && C->getConstantFPValue()->isAllOnesValue();
35666 // fand (fxor X, -1), Y --> fandn X, Y
35667 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
35668 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
35670 // fand X, (fxor Y, -1) --> fandn Y, X
35671 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
35672 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
35677 /// Do target-specific dag combines on X86ISD::FAND nodes.
35678 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
35679 const X86Subtarget &Subtarget) {
35680 // FAND(0.0, x) -> 0.0
35681 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
35684 // FAND(x, 0.0) -> 0.0
35685 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
// Try the fand(fxor(X,-1),Y) -> fandn(X,Y) fold before the generic lowering.
35688 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
35691 return lowerX86FPLogicOp(N, DAG, Subtarget);
35694 /// Do target-specific dag combines on X86ISD::FANDN nodes.
35695 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
35696 const X86Subtarget &Subtarget) {
35697 // FANDN(0.0, x) -> x
// FANDN computes (~Op0 & Op1), so a zero first operand passes Op1 through.
35698 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
35699 return N->getOperand(1);
35701 // FANDN(x, 0.0) -> 0.0
35702 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
35705 return lowerX86FPLogicOp(N, DAG, Subtarget);
35708 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
35709 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
35710 const X86Subtarget &Subtarget) {
35711 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
35713 // F[X]OR(0.0, x) -> x
35714 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
35715 return N->getOperand(1);
35717 // F[X]OR(x, 0.0) -> x
35718 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
35719 return N->getOperand(0);
// An FXOR with a sign-mask constant is an FP negation; try the FNEG combines.
35722 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
35725 return lowerX86FPLogicOp(N, DAG, Subtarget);
35728 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
35729 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
35730 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
35732 // Only perform optimizations if UnsafeMath is used.
// FMINC/FMAXC ignore the NaN/signed-zero ordering guarantees of FMIN/FMAX,
// so this rewrite is only valid under unsafe FP math.
35733 if (!DAG.getTarget().Options.UnsafeFPMath)
35736 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
35737 // into FMINC and FMAXC, which are Commutative operations.
35738 unsigned NewOp = 0;
35739 switch (N->getOpcode()) {
35740 default: llvm_unreachable("unknown opcode");
35741 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
35742 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
35745 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
35746 N->getOperand(0), N->getOperand(1));
/// Lower ISD::FMAXNUM/FMINNUM to X86 FMAX/FMIN plus a select that gives the
/// IEEE-754 "number wins over NaN" behavior the hardware min/max lacks.
35749 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
35750 const X86Subtarget &Subtarget) {
35751 if (Subtarget.useSoftFloat())
35754 // TODO: Check for global or instruction-level "nnan". In that case, we
35755 // should be able to lower to FMAX/FMIN alone.
35756 // TODO: If an operand is already known to be a NaN or not a NaN, this
35757 // should be an optional swap and FMAX/FMIN.
// Only the scalar/vector FP types natively supported by the subtarget apply.
35759 EVT VT = N->getValueType(0);
35760 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
35761 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
35762 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
35765 // This takes at least 3 instructions, so favor a library call when operating
35766 // on a scalar and minimizing code size.
35767 if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
35770 SDValue Op0 = N->getOperand(0);
35771 SDValue Op1 = N->getOperand(1);
35773 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
35774 DAG.getDataLayout(), *DAG.getContext(), VT);
35776 // There are 4 possibilities involving NaN inputs, and these are the required
35780 // ----------------
35781 // Num | Max | Op0 |
35782 // Op0 ----------------
35783 // NaN | Op1 | NaN |
35784 // ----------------
35786 // The SSE FP max/min instructions were not designed for this case, but rather
35788 // Min = Op1 < Op0 ? Op1 : Op0
35789 // Max = Op1 > Op0 ? Op1 : Op0
35791 // So they always return Op0 if either input is a NaN. However, we can still
35792 // use those instructions for fmaxnum by selecting away a NaN input.
35794 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
35795 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
35796 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
// SETUO is true iff Op0 is unordered with itself, i.e. Op0 is a NaN.
35797 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
35799 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
35800 // are NaN, the NaN value of Op1 is the result.
35801 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
35804 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
35805 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
35806 TargetLowering::DAGCombinerInfo &DCI,
35807 const X86Subtarget &Subtarget) {
35808 // ANDNP(0, x) -> x
// ANDNP computes (~Op0 & Op1): a zero first operand is an all-ones mask.
35809 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
35810 return N->getOperand(1);
35812 // ANDNP(x, 0) -> 0
35813 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
35814 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
35816 EVT VT = N->getValueType(0);
35818 // Attempt to recursively combine a bitmask ANDNP with shuffles.
35819 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
35821 if (SDValue Res = combineX86ShufflesRecursively(
35822 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
35823 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
35824 DCI.CombineTo(N, Res);
/// Combine X86ISD::BT: the bit-test index is taken modulo the operand width,
/// so only the low log2(width) bits of the index operand matter.
35832 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
35833 TargetLowering::DAGCombinerInfo &DCI) {
35834 SDValue N0 = N->getOperand(0);
35835 SDValue N1 = N->getOperand(1);
35837 // BT ignores high bits in the bit index operand.
35838 unsigned BitWidth = N1.getValueSizeInBits();
35839 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
// If the index can be simplified under that mask, rebuild the BT with it.
35840 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
35841 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
/// Combine ISD::SIGN_EXTEND_INREG on vectors; currently handles the v4i64
/// case by doing the in-register sign-extension at v4i32 and widening after.
35846 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
35847 const X86Subtarget &Subtarget) {
35848 EVT VT = N->getValueType(0);
35849 if (!VT.isVector())
35852 SDValue N0 = N->getOperand(0);
35853 SDValue N1 = N->getOperand(1);
// Operand 1 of SIGN_EXTEND_INREG is a VTSDNode carrying the "from" type.
35854 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
35857 // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
35858 // both SSE and AVX2 since there is no sign-extended shift right
35859 // operation on a vector with 64-bit elements.
35860 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
35861 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
35862 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
35863 N0.getOpcode() == ISD::SIGN_EXTEND)) {
35864 SDValue N00 = N0.getOperand(0);
35866 // EXTLOAD has a better solution on AVX2,
35867 // it may be replaced with X86ISD::VSEXT node.
35868 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
35869 if (!ISD::isNormalLoad(N00.getNode()))
35872 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
35873 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
35875 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
35881 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
35882 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
35883 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
35884 /// opportunities to combine math ops, use an LEA, or use a complex addressing
35885 /// mode. This can eliminate extend, add, and shift instructions.
35886 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
35887 const X86Subtarget &Subtarget) {
35888 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
35889 Ext->getOpcode() != ISD::ZERO_EXTEND)
35892 // TODO: This should be valid for other integer types.
35893 EVT VT = Ext->getValueType(0);
35894 if (VT != MVT::i64)
35897 SDValue Add = Ext->getOperand(0);
35898 if (Add.getOpcode() != ISD::ADD)
35901 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
35902 bool NSW = Add->getFlags().hasNoSignedWrap();
35903 bool NUW = Add->getFlags().hasNoUnsignedWrap();
35905 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
// into the 'zext' for the extension to be hoistable without changing the
// result (the matching no-wrap flag guarantees equivalence).
35907 if ((Sext && !NSW) || (!Sext && !NUW))
35910 // Having a constant operand to the 'add' ensures that we are not increasing
35911 // the instruction count because the constant is extended for free below.
35912 // A constant operand can also become the displacement field of an LEA.
35913 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
35917 // Don't make the 'add' bigger if there's no hope of combining it with some
35918 // other 'add' or 'shl' instruction.
35919 // TODO: It may be profitable to generate simpler LEA instructions in place
35920 // of single 'add' instructions, but the cost model for selecting an LEA
35921 // currently has a high threshold.
35922 bool HasLEAPotential = false;
35923 for (auto *User : Ext->uses()) {
35924 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
35925 HasLEAPotential = true;
35929 if (!HasLEAPotential)
35932 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
35933 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
35934 SDValue AddOp0 = Add.getOperand(0);
35935 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
35936 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
35938 // The wider add is guaranteed to not wrap because both operands are
// extended values, so propagate the original no-wrap flags.
35941 Flags.setNoSignedWrap(NSW);
35942 Flags.setNoUnsignedWrap(NUW);
35943 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
35946 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
35947 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
35948 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
35949 /// extends from AH (which we otherwise need to do contortions to access).
35950 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
35951 SDValue N0 = N->getOperand(0);
35952 auto OpcodeN = N->getOpcode();
35953 auto OpcodeN0 = N0.getOpcode();
// Only sext-of-sdivrem and zext-of-udivrem pairs are matched.
35954 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
35955 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
35958 EVT VT = N->getValueType(0);
35959 EVT InVT = N0.getValueType();
// ResNo 1 is the remainder result of the divrem node; only i8 divrem whose
// remainder is extended to i32/i64 maps onto the AH-register form.
35960 if (N0.getResNo() != 1 || InVT != MVT::i8 ||
35961 !(VT == MVT::i32 || VT == MVT::i64))
35964 SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
35965 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
35966 : X86ISD::UDIVREM8_ZEXT_HREG;
35967 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
// Re-point users of the original quotient at the new node's quotient.
35969 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
35970 // If this was a 64-bit extend, complete it.
35971 if (VT == MVT::i64)
35972 return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
35973 return R.getValue(1);
35976 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
35977 // operands and the result of CMOV is not used anywhere else - promote CMOV
35978 // itself instead of promoting its result. This could be beneficial, because:
35979 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
35980 // (or more) pseudo-CMOVs only when they go one-after-another and
35981 // getting rid of result extension code after CMOV will help that.
35982 // 2) Promotion of constant CMOV arguments is free, hence the
35983 // {ANY,SIGN,ZERO}_EXTEND will just be deleted.
35984 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
35985 // promotion is also good in terms of code-size.
35986 // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
35988 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
35989 SDValue CMovN = Extend->getOperand(0);
35990 if (CMovN.getOpcode() != X86ISD::CMOV)
35993 EVT TargetVT = Extend->getValueType(0);
35994 unsigned ExtendOpcode = Extend->getOpcode();
35997 EVT VT = CMovN.getValueType();
35998 SDValue CMovOp0 = CMovN.getOperand(0);
35999 SDValue CMovOp1 = CMovN.getOperand(1);
// Only promote i16 CMOV to i32/i64 when both value operands are constants
// (free to extend) and the CMOV result has no other users.
36001 bool DoPromoteCMOV =
36002 (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
36003 CMovN.hasOneUse() &&
36004 (isa<ConstantSDNode>(CMovOp0.getNode()) &&
36005 isa<ConstantSDNode>(CMovOp1.getNode()));
36007 if (!DoPromoteCMOV)
// Extend the constant operands and rebuild the CMOV at the wider type,
// keeping its condition (operand 2) and flags (operand 3) unchanged.
36010 CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
36011 CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);
36013 return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
36014 CMovN.getOperand(2), CMovN.getOperand(3));
36017 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
36018 // This is more or less the reverse of combineBitcastvxi1.
36020 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
36021 TargetLowering::DAGCombinerInfo &DCI,
36022 const X86Subtarget &Subtarget) {
36023 unsigned Opcode = N->getOpcode();
36024 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
36025 Opcode != ISD::ANY_EXTEND)
36027 if (!DCI.isBeforeLegalizeOps())
36029 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
36032 SDValue N0 = N->getOperand(0);
36033 EVT VT = N->getValueType(0);
36034 EVT SVT = VT.getScalarType();
36035 EVT InSVT = N0.getValueType().getScalarType();
36036 unsigned EltSizeInBits = SVT.getSizeInBits();
36038 // Input type must be extending a bool vector (bit-casted from a scalar
36039 // integer) to legal integer types.
36040 if (!VT.isVector())
36042 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
36044 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
36047 SDValue N00 = N0.getOperand(0);
36048 EVT SclVT = N0.getOperand(0).getValueType();
36049 if (!SclVT.isScalarInteger())
36054 SmallVector<int, 32> ShuffleMask;
36055 unsigned NumElts = VT.getVectorNumElements();
36056 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
36058 // Broadcast the scalar integer to the vector elements.
36059 if (NumElts > EltSizeInBits) {
36060 // If the scalar integer is greater than the vector element size, then we
36061 // must split it down into sub-sections for broadcasting. For example:
36062 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
36063 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
36064 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
36065 unsigned Scale = NumElts / EltSizeInBits;
36067 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
36068 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
36069 Vec = DAG.getBitcast(VT, Vec);
// Replicate each sub-section across its group of lanes via the shuffle mask.
36071 for (unsigned i = 0; i != Scale; ++i)
36072 ShuffleMask.append(EltSizeInBits, i);
36074 // For smaller scalar integers, we can simply any-extend it to the vector
36075 // element size (we don't care about the upper bits) and broadcast it to all
36077 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
36078 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
36079 ShuffleMask.append(NumElts, 0);
36081 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
36083 // Now, mask the relevant bit in each element.
36084 SmallVector<SDValue, 32> Bits;
36085 for (unsigned i = 0; i != NumElts; ++i) {
36086 int BitIdx = (i % EltSizeInBits);
36087 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
36088 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
36090 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
36091 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
36093 // Compare against the bitmask and extend the result.
36094 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
36095 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
36096 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
36098 // For SEXT, this is now done, otherwise shift the result down for
// zero-extension: shifting the sign-extended result right by EltSize-1
// leaves 0 or 1 per lane.
36100 if (Opcode == ISD::SIGN_EXTEND)
36102 return DAG.getNode(ISD::SRL, DL, VT, Vec,
36103 DAG.getConstant(EltSizeInBits - 1, DL, VT));
36106 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
36107 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
36108 /// with UNDEFs) of the input to vectors of the same size as the target type
36109 /// which then extends the lowest elements.
36110 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
36111 TargetLowering::DAGCombinerInfo &DCI,
36112 const X86Subtarget &Subtarget) {
36113 unsigned Opcode = N->getOpcode();
36114 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
36116 if (!DCI.isBeforeLegalizeOps())
36118 if (!Subtarget.hasSSE2())
36121 SDValue N0 = N->getOperand(0);
36122 EVT VT = N->getValueType(0);
36123 EVT SVT = VT.getScalarType();
36124 EVT InVT = N0.getValueType();
36125 EVT InSVT = InVT.getScalarType();
36127 // Input type must be a vector and we must be extending legal integer types.
36128 if (!VT.isVector())
36130 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
36132 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
36135 // On AVX2+ targets, if the input/output types are both legal then we will be
36136 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
36137 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
36138 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
// Widen a vector to the given bit Size by concatenating UNDEF vectors of
// the same element type after it.
36143 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
36144 EVT InVT = N.getValueType();
36145 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
36146 Size / InVT.getScalarSizeInBits());
36147 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
36148 DAG.getUNDEF(InVT));
36150 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
36153 // If target-size is less than 128-bits, extend to a type that would extend
36154 // to 128 bits, extend that and extract the original target vector.
36155 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
36156 unsigned Scale = 128 / VT.getSizeInBits();
36158 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
36159 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
36160 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
36161 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
36162 DAG.getIntPtrConstant(0, DL));
36165 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
36166 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
36167 // Also use this if we don't have SSE41 to allow the legalizer do its job.
36168 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
36169 (VT.is256BitVector() && Subtarget.hasInt256()) ||
36170 (VT.is512BitVector() && Subtarget.hasAVX512())) {
36171 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
36172 return Opcode == ISD::SIGN_EXTEND
36173 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
36174 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
// Split the extension into SplitSize-bit pieces, extend each piece
// in-register, and reassemble with CONCAT_VECTORS.
36177 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
36178 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
36179 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
36180 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
36181 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
36183 SmallVector<SDValue, 8> Opnds;
36184 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
36185 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
36186 DAG.getIntPtrConstant(Offset, DL));
36187 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
36188 SrcVec = Opcode == ISD::SIGN_EXTEND
36189 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
36190 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
36191 Opnds.push_back(SrcVec);
36193 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
36196 // On pre-AVX2 targets, split into 128-bit nodes of
36197 // ISD::*_EXTEND_VECTOR_INREG.
36198 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
36199 return SplitAndExtendInReg(128);
36201 // On pre-AVX512 targets, split into 256-bit nodes of
36202 // ISD::*_EXTEND_VECTOR_INREG.
36203 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
36204 return SplitAndExtendInReg(256);
/// Combine ISD::SIGN_EXTEND: divrem8 hoisting, CMOV promotion, the
/// sext-of-inverted-bool fold, and the vector in-register extension combines.
36209 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
36210 TargetLowering::DAGCombinerInfo &DCI,
36211 const X86Subtarget &Subtarget) {
36212 SDValue N0 = N->getOperand(0);
36213 EVT VT = N->getValueType(0);
36214 EVT InVT = N0.getValueType();
36217 if (SDValue DivRem8 = getDivRem8(N, DAG))
36220 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
36223 if (!DCI.isBeforeLegalizeOps())
36226 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
36227 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
36228 // Invert and sign-extend a boolean is the same as zero-extend and subtract
36229 // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
36230 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
36231 // sext (xor Bool, -1) --> sub (zext Bool), 1
36232 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
36233 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
36236 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
36239 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
36243 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
36246 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
// DAG combine for FMA-family nodes (ISD::FMA and the X86ISD FMADD* variants).
// Looks through FNEG on the A, B and C operands and folds the negations into
// the opcode itself (FMA -> FMSUB/FNMADD/FNMSUB and the corresponding
// scalar/_RND/4S variants), so the negate instructions disappear.
36252 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
36253 const X86Subtarget &Subtarget) {
36254 // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
36256 EVT VT = N->getValueType(0);
36258 // Let legalize expand this if it isn't a legal type yet.
36259 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
// Requires f32/f64 elements and FMA hardware support.
36262 EVT ScalarVT = VT.getScalarType();
36263 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
36266 SDValue A = N->getOperand(0);
36267 SDValue B = N->getOperand(1);
36268 SDValue C = N->getOperand(2);
// If V is an FNEG, strip the negation in place and report that we did so.
36270 auto invertIfNegative = [](SDValue &V) {
36271 if (SDValue NegVal = isFNEG(V.getNode())) {
36278 // Do not convert the passthru input of scalar intrinsics.
36279 // FIXME: We could allow negations of the lower element only.
// FMADDS1 keeps operand 0 as passthru, FMADDS3 keeps operand 2; skip those.
36280 bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
36281 N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
36282 bool NegB = invertIfNegative(B);
36283 bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
36284 N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
36286 // Negative multiplication when NegA xor NegB
36287 bool NegMul = (NegA != NegB);
36288 bool HasNeg = NegA || NegB || NegC;
// Pick the base opcode from the multiply sign and the sign of C.
36290 unsigned NewOpcode;
36292 NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
36294 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
36296 // For FMA, we risk reconstructing the node we started with.
36297 // In order to avoid this, we check for negation or opcode change. If
36298 // one of the two happened, then it is a new node and we return it.
36299 if (N->getOpcode() == ISD::FMA) {
36300 if (HasNeg || NewOpcode != N->getOpcode())
36301 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
// For the X86-specific variants, remap the base opcode onto the matching
// flavor (_RND, S1, S3, S1_RND, S3_RND, 4S) of the original node.
36305 if (N->getOpcode() == X86ISD::FMADD_RND) {
36306 switch (NewOpcode) {
36307 case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break;
36308 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
36309 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
36310 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
36312 } else if (N->getOpcode() == X86ISD::FMADDS1) {
36313 switch (NewOpcode) {
36314 case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
36315 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
36316 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
36317 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
36319 } else if (N->getOpcode() == X86ISD::FMADDS3) {
36320 switch (NewOpcode) {
36321 case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
36322 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
36323 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
36324 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
36326 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
36327 switch (NewOpcode) {
36328 case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
36329 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
36330 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
36331 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
36333 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
36334 switch (NewOpcode) {
36335 case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break;
36336 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
36337 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
36338 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
36340 } else if (N->getOpcode() == X86ISD::FMADD4S) {
36341 switch (NewOpcode) {
36342 case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
36343 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
36344 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
36345 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
36348 llvm_unreachable("Unexpected opcode!");
36351 // Only return the node is the opcode was changed or one of the
36352 // operand was negated. If not, we'll just recreate the same node.
36353 if (HasNeg || NewOpcode != N->getOpcode()) {
// 4-operand variants carry an extra operand (e.g. a rounding mode);
// forward it unchanged.
36354 if (N->getNumOperands() == 4)
36355 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
36356 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
36362 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
// Also handles the reverse direction and the _RND variants: an FNEG on the
// C operand toggles FMADDSUB <-> FMSUBADD, absorbing the negate.
36363 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
36364 const X86Subtarget &Subtarget) {
36366 EVT VT = N->getValueType(0);
// Only fires when operand 2 is recognized as an FNEG.
36368 SDValue NegVal = isFNEG(N->getOperand(2).getNode());
36372 unsigned NewOpcode;
36373 switch (N->getOpcode()) {
36374 default: llvm_unreachable("Unexpected opcode!");
36375 case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
36376 case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
36377 case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
36378 case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
// 4-operand (_RND) variants forward their extra operand unchanged.
36381 if (N->getNumOperands() == 4)
36382 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
36383 NegVal, N->getOperand(3));
36384 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
// DAG combine for ISD::ZERO_EXTEND nodes. Folds zext of (and/truncate of
// SETCC_CARRY) into a wide SETCC_CARRY masked with 1, then tries the same
// battery of helpers as the sign-extend combine (CMOV, vector in-reg forms,
// mask widening, div/rem-of-i8, add promotion, ctlz/srl).
36388 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
36389 TargetLowering::DAGCombinerInfo &DCI,
36390 const X86Subtarget &Subtarget) {
36391 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
36392 // (and (i32 x86isd::setcc_carry), 1)
36393 // This eliminates the zext. This transformation is necessary because
36394 // ISD::SETCC is always legalized to i8.
36396 SDValue N0 = N->getOperand(0);
36397 EVT VT = N->getValueType(0);
36399 if (N0.getOpcode() == ISD::AND &&
36401 N0.getOperand(0).hasOneUse()) {
36402 SDValue N00 = N0.getOperand(0);
36403 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
// Only an AND with constant 1 is a pure boolean mask we can rebuild.
36404 if (!isOneConstant(N0.getOperand(1)))
36406 return DAG.getNode(ISD::AND, dl, VT,
36407 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
36408 N00.getOperand(0), N00.getOperand(1)),
36409 DAG.getConstant(1, dl, VT));
// Same fold when the SETCC_CARRY reaches us through a one-use TRUNCATE.
36413 if (N0.getOpcode() == ISD::TRUNCATE &&
36415 N0.getOperand(0).hasOneUse()) {
36416 SDValue N00 = N0.getOperand(0);
36417 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
36418 return DAG.getNode(ISD::AND, dl, VT,
36419 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
36420 N00.getOperand(0), N00.getOperand(1)),
36421 DAG.getConstant(1, dl, VT));
36425 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
36428 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
36431 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
36435 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
36438 if (SDValue DivRem8 = getDivRem8(N, DAG))
36441 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
36444 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
36450 /// Try to map a 128-bit or larger integer comparison to vector instructions
36451 /// before type legalization splits it up into chunks.
///
/// Handles eq/ne only. i128 compares become PCMPEQB+MOVMSK (SSE2); i256
/// compares become VPCMPEQB+VPMOVMSKB (AVX2). A pattern of
/// (or (xor A,B), (xor C,D)) == 0, as produced by memcmp expansion, is
/// handled as two vector compares ANDed together.
36452 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
36453 const X86Subtarget &Subtarget) {
36454 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
36455 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
36457 // We're looking for an oversized integer equality comparison.
36458 SDValue X = SetCC->getOperand(0);
36459 SDValue Y = SetCC->getOperand(1);
36460 EVT OpVT = X.getValueType();
36461 unsigned OpSize = OpVT.getSizeInBits();
36462 if (!OpVT.isScalarInteger() || OpSize < 128)
36465 // Ignore a comparison with zero because that gets special treatment in
36466 // EmitTest(). But make an exception for the special case of a pair of
36467 // logically-combined vector-sized operands compared to zero. This pattern may
36468 // be generated by the memcmp expansion pass with oversized integer compares
36470 bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
36471 X.getOperand(0).getOpcode() == ISD::XOR &&
36472 X.getOperand(1).getOpcode() == ISD::XOR;
36473 if (isNullConstant(Y) && !IsOrXorXorCCZero)
36476 // Bail out if we know that this is not really just an oversized integer.
36477 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
36478 peekThroughBitcasts(Y).getValueType() == MVT::f128)
36481 // TODO: Use PXOR + PTEST for SSE4.1 or later?
36482 // TODO: Add support for AVX-512.
36483 EVT VT = SetCC->getValueType(0);
36485 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
36486 (OpSize == 256 && Subtarget.hasAVX2())) {
36487 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
36489 if (IsOrXorXorCCZero) {
36490 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
36491 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
36492 // Use 2 vector equality compares and 'and' the results before doing a
36494 SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
36495 SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
36496 SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
36497 SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
36498 SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B);
36499 SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D);
36500 Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
// Plain X == Y case: one byte-wise vector compare.
36502 SDValue VecX = DAG.getBitcast(VecVT, X);
36503 SDValue VecY = DAG.getBitcast(VecVT, Y);
36504 Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
36506 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
36507 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
36508 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
36509 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
36510 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
36511 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
36512 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
36514 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
// DAG combine for ISD::SETCC. Canonicalizes 0-x compares into x+y vs 0,
// delegates oversized integer equality to the vector-sized helper, simplifies
// vXi1 setcc of (sext vXi1) against zero, and lowers SSE1-only v4f32
// comparisons early to X86ISD::CMPP.
36520 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
36521 const X86Subtarget &Subtarget) {
36522 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
36523 SDValue LHS = N->getOperand(0);
36524 SDValue RHS = N->getOperand(1);
36525 EVT VT = N->getValueType(0);
36528 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
36529 EVT OpVT = LHS.getValueType();
36530 // 0-x == y --> x+y == 0
36531 // 0-x != y --> x+y != 0
36532 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
36534 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
36535 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
36537 // x == 0-y --> x+y == 0
36538 // x != 0-y --> x+y != 0
36539 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
36541 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
36542 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
36545 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
// vXi1 result with eq/ne or a signed predicate: try to fold away the setcc
// when comparing a sign-extended i1 vector against all-zeros.
36549 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
36550 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
36551 // Put build_vectors on the right.
36552 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
36553 std::swap(LHS, RHS);
36554 CC = ISD::getSetCCSwappedOperands(CC);
36558 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
36559 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
36560 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
36562 if (IsSEXT0 && IsVZero1) {
36563 assert(VT == LHS.getOperand(0).getValueType() &&
36564 "Uexpected operand type");
// sext(i1) is 0 or -1, so against zero: GT is never true, LE is always
// true, EQ/GE is the inverted bool, NE/LT is the bool itself.
36565 if (CC == ISD::SETGT)
36566 return DAG.getConstant(0, DL, VT);
36567 if (CC == ISD::SETLE)
36568 return DAG.getConstant(1, DL, VT);
36569 if (CC == ISD::SETEQ || CC == ISD::SETGE)
36570 return DAG.getNOT(DL, LHS.getOperand(0), VT);
36572 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
36573 "Unexpected condition code!");
36574 return LHS.getOperand(0);
36578 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
36579 // to avoid scalarization via legalization because v4i32 is not a legal type.
36580 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
36581 LHS.getValueType() == MVT::v4f32)
36582 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
// DAG combine for X86ISD::MOVMSK. MOVMSK reads only the sign bit of each
// source element, so demand just those bits from the source; if that
// simplifies the source, commit the change and return the node itself to
// signal that it was updated in place.
36587 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
36588 TargetLowering::DAGCombinerInfo &DCI) {
36589 SDValue Src = N->getOperand(0);
36590 MVT SrcVT = Src.getSimpleValueType();
36592 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36593 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36594 !DCI.isBeforeLegalizeOps());
36596 // MOVMSK only uses the MSB from each vector element.
36598 APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
36599 if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
36600 DCI.AddToWorklist(Src.getNode());
36601 DCI.CommitTargetLoweringOpt(TLO);
36602 return SDValue(N, 0);
// DAG combine for gather/scatter nodes. Operand 4 is the index vector and
// operand 2 is the mask. Before legalization: strips redundant sign/zero
// extends from the index and normalizes it to i32/i64 elements. Afterwards:
// on AVX-512 drops SIGN_EXTEND_INREG from the mask (it is truncated to v*i1
// anyway); otherwise demands only the mask's sign bits.
36608 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
36609 TargetLowering::DAGCombinerInfo &DCI,
36610 const X86Subtarget &Subtarget) {
36613 if (DCI.isBeforeLegalizeOps()) {
36614 SDValue Index = N->getOperand(4);
36615 // Remove any sign extends from 32 or smaller to larger than 32.
36616 // Only do this before LegalizeOps in case we need the sign extend for
36618 if (Index.getOpcode() == ISD::SIGN_EXTEND) {
36619 if (Index.getScalarValueSizeInBits() > 32 &&
36620 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
36621 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36622 NewOps[4] = Index.getOperand(0);
36623 DAG.UpdateNodeOperands(N, NewOps);
36624 // The original sign extend has less users, add back to worklist in case
36625 // it needs to be removed
36626 DCI.AddToWorklist(Index.getNode());
36627 DCI.AddToWorklist(N);
36628 return SDValue(N, 0);
36632 // Make sure the index is either i32 or i64
36633 unsigned ScalarSize = Index.getScalarValueSizeInBits();
36634 if (ScalarSize != 32 && ScalarSize != 64) {
// Round to the nearest supported width: widen small indices to i32,
// shrink oversized ones to i64, sign-extending or truncating as needed.
36635 MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
36636 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
36637 Index.getValueType().getVectorNumElements());
36638 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
36639 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36641 DAG.UpdateNodeOperands(N, NewOps);
36642 DCI.AddToWorklist(N);
36643 return SDValue(N, 0);
36646 // Try to remove zero extends from 32->64 if we know the sign bit of
36647 // the input is zero.
36648 if (Index.getOpcode() == ISD::ZERO_EXTEND &&
36649 Index.getScalarValueSizeInBits() == 64 &&
36650 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
36651 if (DAG.SignBitIsZero(Index.getOperand(0))) {
36652 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36653 NewOps[4] = Index.getOperand(0);
36654 DAG.UpdateNodeOperands(N, NewOps);
36655 // The original zero extend has less users, add back to worklist in case
36656 // it needs to be removed
36657 DCI.AddToWorklist(Index.getNode());
36658 DCI.AddToWorklist(N);
36659 return SDValue(N, 0);
36664 // Gather and Scatter instructions use k-registers for masks. The type of
36665 // the masks is v*i1. So the mask will be truncated anyway.
36666 // The SIGN_EXTEND_INREG my be dropped.
36667 SDValue Mask = N->getOperand(2);
36668 if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
36669 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36670 NewOps[2] = Mask.getOperand(0);
36671 DAG.UpdateNodeOperands(N, NewOps);
36672 return SDValue(N, 0);
36675 // With AVX2 we only demand the upper bit of the mask.
36676 if (!Subtarget.hasAVX512()) {
36677 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36678 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36679 !DCI.isBeforeLegalizeOps());
36681 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
36682 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
36683 DCI.AddToWorklist(Mask.getNode());
36684 DCI.CommitTargetLoweringOpt(TLO);
36685 return SDValue(N, 0);
36692 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
// Tries to simplify the flag-producing operand together with the condition
// code; on success rebuilds the SETCC from the (possibly updated) CC/flags.
36693 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
36694 const X86Subtarget &Subtarget) {
36696 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
36697 SDValue EFLAGS = N->getOperand(1);
36699 // Try to simplify the EFLAGS and condition code operands.
36700 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
36701 return getSETCC(CC, Flags, DL, DAG);
36706 /// Optimize branch condition evaluation.
///
/// Same idea as combineX86SetCC but for X86ISD::BRCOND: operand 2 is the
/// condition code and operand 3 the EFLAGS producer; rebuild the branch if
/// combineSetCCEFLAGS simplifies them.
36707 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
36708 const X86Subtarget &Subtarget) {
36710 SDValue EFLAGS = N->getOperand(3);
36711 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
36713 // Try to simplify the EFLAGS and condition code operands.
36714 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
36715 // RAUW them under us.
36716 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
36717 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
36718 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
36719 N->getOperand(1), Cond, Flags);
// Fold a unary op applied to (setcc-mask AND constant-vector) by applying
// the unary op to the constant at compile time and keeping the cheap AND.
36725 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
36726 SelectionDAG &DAG) {
36727 // Take advantage of vector comparisons producing 0 or -1 in each lane to
36728 // optimize away operation when it's from a constant.
36730 // The general transformation is:
36731 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
36732 // AND(VECTOR_CMP(x,y), constant2)
36733 // constant2 = UNARYOP(constant)
36735 // Early exit if this isn't a vector operation, the operand of the
36736 // unary operation isn't a bitwise AND, or if the sizes of the operations
36737 // aren't the same.
36738 EVT VT = N->getValueType(0);
36739 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
36740 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
36741 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
36744 // Now check that the other operand of the AND is a constant. We could
36745 // make the transformation for non-constant splats as well, but it's unclear
36746 // that would be a benefit as it would not eliminate any operations, just
36747 // perform one more step in scalar code before moving to the vector unit.
36748 if (BuildVectorSDNode *BV =
36749 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
36750 // Bail out if the vector isn't a constant.
36751 if (!BV->isConstant())
36754 // Everything checks out. Build up the new and improved node.
36756 EVT IntVT = BV->getValueType(0);
36757 // Create a new constant of the appropriate type for the transformed
// Constant-fold UNARYOP(constant) by emitting the op on the build_vector.
36759 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
36760 // The AND node needs bitcasts to/from an integer vector type around it.
36761 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
36762 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
36763 N->getOperand(0)->getOperand(0), MaskConst);
36764 SDValue Res = DAG.getBitcast(VT, NewAnd);
// DAG combine for ISD::UINT_TO_FP. Small unsigned vector elements are
// zero-extended to i32 and converted with SINT_TO_FP (equivalent because the
// extended values are non-negative); scalars with a known-zero sign bit are
// converted with SINT_TO_FP directly.
36771 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
36772 const X86Subtarget &Subtarget) {
36773 SDValue Op0 = N->getOperand(0);
36774 EVT VT = N->getValueType(0);
36775 EVT InVT = Op0.getValueType();
36776 EVT InSVT = InVT.getScalarType();
36778 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
36779 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
36780 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
36782 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
36783 InVT.getVectorNumElements());
36784 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
36786 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
36787 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
36790 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
36791 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
36792 // the optimization here.
36793 if (DAG.SignBitIsZero(Op0))
36794 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
// DAG combine for ISD::SINT_TO_FP. Constant-folds through compare-and-mask
// patterns, sign-extends small vector elements to i32 first, truncates wide
// inputs to i32 when the upper bits are all sign bits (no DQI), and on
// 32-bit targets converts i64 loads via x87 FILD.
36799 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
36800 const X86Subtarget &Subtarget) {
36801 // First try to optimize away the conversion entirely when it's
36802 // conditionally from a constant. Vectors only.
36803 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
36806 // Now move on to more general possibilities.
36807 SDValue Op0 = N->getOperand(0);
36808 EVT VT = N->getValueType(0);
36809 EVT InVT = Op0.getValueType();
36810 EVT InSVT = InVT.getScalarType();
36812 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
36813 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
36814 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
36815 if (InVT.isVector() &&
36816 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
36817 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
36819 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
36820 InVT.getVectorNumElements());
36821 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
36822 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
36825 // Without AVX512DQ we only support i64 to float scalar conversion. For both
36826 // vectors and scalars, see if we know that the upper bits are all the sign
36827 // bit, in which case we can truncate the input to i32 and convert from that.
36828 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
36829 unsigned BitWidth = InVT.getScalarSizeInBits();
36830 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
// >= BitWidth-31 sign bits means the value fits in a signed i32.
36831 if (NumSignBits >= (BitWidth - 31)) {
36832 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
36833 if (InVT.isVector())
36834 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
36835 InVT.getVectorNumElements());
36837 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
36838 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
36842 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
36843 // a 32-bit target where SSE doesn't support i64->FP operations.
36844 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
36845 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
36846 EVT LdVT = Ld->getValueType(0);
36848 // This transformation is not supported if the result type is f16 or f128.
36849 if (VT == MVT::f16 || VT == MVT::f128)
// Only non-volatile, non-extending, single-use scalar i64 loads qualify.
36852 if (!Ld->isVolatile() && !VT.isVector() &&
36853 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
36854 !Subtarget.is64Bit() && LdVT == MVT::i64) {
36855 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
36856 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
// Re-route the load's chain users to the FILD chain result.
36857 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
// DAG combine for X86ISD::SBB: if the incoming carry (operand 2) can be
// routed through an ADD by combineCarryThroughADD, rebuild the SBB with the
// simplified flags producer.
36864 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
36865 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
36866 MVT VT = N->getSimpleValueType(0);
36867 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
36868 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
36869 N->getOperand(0), N->getOperand(1),
36876 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
// Two combines: (1) ADC 0, 0, carry reduces to (SETCC_CARRY & 1) when the
// flags result is unused; (2) the carry operand can be simplified through an
// ADD, in which case the ADC is rebuilt on the simplified flags.
36877 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
36878 TargetLowering::DAGCombinerInfo &DCI) {
36879 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
36880 // the result is either zero or one (depending on the input carry bit).
36881 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
36882 if (X86::isZeroNode(N->getOperand(0)) &&
36883 X86::isZeroNode(N->getOperand(1)) &&
36884 // We don't have a good way to replace an EFLAGS use, so only do this when
36886 SDValue(N, 1).use_empty()) {
36888 EVT VT = N->getValueType(0);
// Result 1 (the outgoing flags) is replaced with constant 0.
36889 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
36890 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
36891 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36892 DAG.getConstant(X86::COND_B, DL,
36895 DAG.getConstant(1, DL, VT));
36896 return DCI.CombineTo(N, Res1, CarryOut);
36899 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
36900 MVT VT = N->getSimpleValueType(0);
36901 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
36902 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
36903 N->getOperand(0), N->getOperand(1),
36910 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
36911 /// which is more useful than 0/1 in some cases.
///
/// Builds SETCC_CARRY(COND_B, EFLAGS) in i8 and then adapts it to N's result
/// type; the i1 case is handled via truncate.
36912 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
36914 // "Condition code B" is also known as "the carry flag" (CF).
36915 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
36916 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
36917 MVT VT = N->getSimpleValueType(0);
36919 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
36921 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
36922 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
36925 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
36926 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
36927 /// with CMP+{ADC, SBB}.
///
/// Structure: canonicalize the setcc (possibly behind a one-use zext) to the
/// RHS (Y), then special-case X being the constant -1 or 0 (which folds the
/// whole node to a single SETCC_CARRY), then handle carry-based predicates
/// (COND_B / COND_A), and finally the general COND_E / COND_NE-of-zero case
/// using a fake (cmp Z, 1) to set CF exactly when Z == 0.
36928 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
36929 bool IsSub = N->getOpcode() == ISD::SUB;
36930 SDValue X = N->getOperand(0);
36931 SDValue Y = N->getOperand(1);
36933 // If this is an add, canonicalize a zext operand to the RHS.
36934 // TODO: Incomplete? What if both sides are zexts?
36935 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
36936 Y.getOpcode() != ISD::ZERO_EXTEND)
36939 // Look through a one-use zext.
36940 bool PeekedThroughZext = false;
36941 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
36942 Y = Y.getOperand(0);
36943 PeekedThroughZext = true;
36946 // If this is an add, canonicalize a setcc operand to the RHS.
36947 // TODO: Incomplete? What if both sides are setcc?
36948 // TODO: Should we allow peeking through a zext of the other operand?
36949 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
36950 Y.getOpcode() != X86ISD::SETCC)
// From here on Y must be a one-use X86ISD::SETCC.
36953 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
36957 EVT VT = N->getValueType(0);
36958 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
36960 // If X is -1 or 0, then we have an opportunity to avoid constants required in
36961 // the general case below.
36962 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
36964 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
36965 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
36966 // This is a complicated way to get -1 or 0 from the carry flag:
36967 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
36968 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
36969 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36970 DAG.getConstant(X86::COND_B, DL, MVT::i8),
36974 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
36975 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
36976 SDValue EFLAGS = Y->getOperand(1);
// Cannot flip a SUB whose RHS is a constant: CMP cannot take an
// immediate as its first operand.
36977 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
36978 EFLAGS.getValueType().isInteger() &&
36979 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
36980 // Swap the operands of a SUB, and we have the same pattern as above.
36981 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
36982 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
36983 SDValue NewSub = DAG.getNode(
36984 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
36985 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
36986 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
36987 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36988 DAG.getConstant(X86::COND_B, DL, MVT::i8),
36994 if (CC == X86::COND_B) {
36995 // X + SETB Z --> X + (mask SBB Z, Z)
36996 // X - SETB Z --> X - (mask SBB Z, Z)
36997 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
36998 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
36999 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
37000 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
37001 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
37004 if (CC == X86::COND_A) {
37005 SDValue EFLAGS = Y->getOperand(1);
37006 // Try to convert COND_A into COND_B in an attempt to facilitate
37007 // materializing "setb reg".
37009 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
37010 // cannot take an immediate as its first operand.
37012 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
37013 EFLAGS.getValueType().isInteger() &&
37014 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
37015 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
37016 EFLAGS.getNode()->getVTList(),
37017 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
37018 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
37019 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
37020 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
37021 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
37022 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
// Remaining cases require an equality test of Z against zero.
37026 if (CC != X86::COND_E && CC != X86::COND_NE)
37029 SDValue Cmp = Y.getOperand(1);
37030 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
37031 !X86::isZeroNode(Cmp.getOperand(1)) ||
37032 !Cmp.getOperand(0).getValueType().isInteger())
37035 SDValue Z = Cmp.getOperand(0);
37036 EVT ZVT = Z.getValueType();
37038 // If X is -1 or 0, then we have an opportunity to avoid constants required in
37039 // the general case below.
37041 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
37043 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
37044 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
37045 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
37046 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
37047 SDValue Zero = DAG.getConstant(0, DL, ZVT);
37048 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
37049 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
37050 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37051 DAG.getConstant(X86::COND_B, DL, MVT::i8),
37052 SDValue(Neg.getNode(), 1));
37055 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
37056 // with fake operands:
37057 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
37058 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
37059 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
37060 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
37061 SDValue One = DAG.getConstant(1, DL, ZVT);
37062 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
37063 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37064 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
37068 // (cmp Z, 1) sets the carry flag if Z is 0.
37069 SDValue One = DAG.getConstant(1, DL, ZVT);
37070 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
37072 // Add the flags type for ADC/SBB nodes.
37073 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
37075 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
37076 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
37077 if (CC == X86::COND_NE)
37078 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
37079 DAG.getConstant(-1ULL, DL, VT), Cmp1);
37081 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
37082 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
37083 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
37084 DAG.getConstant(0, DL, VT), Cmp1);
// Try to replace a widened-multiply feeding a vector-reduction ADD with
// X86ISD::VPMADDWD (pmaddwd): multiply i16 lanes pairwise and accumulate
// into half as many i32 lanes, then pad with zeros and add into the phi.
// NOTE(review): the embedded original line numbers are non-contiguous, so
// several statements (early 'return SDValue()' exits, the Mode declaration,
// the RegSize = 512/256 assignments) are not visible in this view — verify
// against upstream before modifying.
37087 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
37088 const X86Subtarget &Subtarget) {
// pmaddwd requires SSE2.
37089 if (!Subtarget.hasSSE2())
37092 SDValue MulOp = N->getOperand(0);
37093 SDValue Phi = N->getOperand(1);
// The MUL may be on either side of the reduction add; canonicalize it
// into MulOp, then bail if neither operand is a MUL.
37095 if (MulOp.getOpcode() != ISD::MUL)
37096 std::swap(MulOp, Phi);
37097 if (MulOp.getOpcode() != ISD::MUL)
// Only profitable when the multiply can be narrowed to 16-bit signed
// operands (MULU16 — unsigned 16-bit — is rejected).
37101 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
37104 EVT VT = N->getValueType(0);
// Widest supported vector register: 128 for SSE2, wider with AVX2/BWI
// (the assignments selecting the wider sizes are in the elided lines).
37106 unsigned RegSize = 128;
37107 if (Subtarget.hasBWI())
37109 else if (Subtarget.hasAVX2())
37111 unsigned VectorSize = VT.getVectorNumElements() * 16;
37112 // If the vector size is less than 128, or greater than the supported RegSize,
37113 // do not use PMADD.
37114 if (VectorSize < 128 || VectorSize > RegSize)
// ReducedVT: same lane count as VT but i16 lanes; MAddVT: half the lanes,
// i32 each — the natural output shape of pmaddwd.
37118 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
37119 VT.getVectorNumElements());
37120 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
37121 VT.getVectorNumElements() / 2);
37123 // Shrink the operands of mul.
37124 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
37125 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
37127 // Madd vector size is half of the original vector size
37128 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
37129 // Fill the rest of the output with 0
37130 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
37131 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
// Re-attach the reduction accumulator (phi) with a plain ADD.
37132 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
// Match a sum-of-absolute-differences reduction (vselect of abs-diff feeding
// a reduction ADD) and lower it to X86ISD::PSADBW plus an ADD into the phi.
// NOTE(review): embedded line numbers are non-contiguous — the SelectOp/Phi
// assignments inside the two VSELECT branches and several early returns are
// not visible here; confirm against upstream before editing.
37135 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
37136 const X86Subtarget &Subtarget) {
// psadbw requires SSE2.
37137 if (!Subtarget.hasSSE2())
37141 EVT VT = N->getValueType(0);
37142 SDValue Op0 = N->getOperand(0);
37143 SDValue Op1 = N->getOperand(1);
37145 // TODO: There's nothing special about i32, any integer type above i16 should
37146 // work just as well.
37147 if (!VT.isVector() || !VT.isSimple() ||
37148 !(VT.getVectorElementType() == MVT::i32))
// Widest supported register (wider values selected in the elided lines
// guarded by hasBWI/hasAVX2).
37151 unsigned RegSize = 128;
37152 if (Subtarget.hasBWI())
37154 else if (Subtarget.hasAVX2())
37157 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
37158 // TODO: We should be able to handle larger vectors by splitting them before
37159 // feeding them into several SADs, and then reducing over those.
37160 if (VT.getSizeInBits() / 4 > RegSize)
37163 // We know N is a reduction add, which means one of its operands is a phi.
37164 // To match SAD, we need the other operand to be a vector select.
37165 SDValue SelectOp, Phi;
37166 if (Op0.getOpcode() == ISD::VSELECT) {
37169 } else if (Op1.getOpcode() == ISD::VSELECT) {
37175 // Check whether we have an abs-diff pattern feeding into the select.
// On success detectZextAbsDiff rewrites Op0/Op1 to the abs-diff inputs.
37176 if(!detectZextAbsDiff(SelectOp, Op0, Op1))
37179 // SAD pattern detected. Now build a SAD instruction and an addition for
37180 // reduction. Note that the number of elements of the result of SAD is less
37181 // than the number of elements of its input. Therefore, we could only update
37182 // part of elements in the reduction vector.
37183 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
37185 // The output of PSADBW is a vector of i64.
37186 // We need to turn the vector of i64 into a vector of i32.
37187 // If the reduction vector is at least as wide as the psadbw result, just
37188 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
37190 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
37191 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
37192 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
37194 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
37196 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
37197 // Fill the upper elements with zero to match the add width.
37198 SDValue Zero = DAG.getConstant(0, DL, VT);
37199 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
37200 DAG.getIntPtrConstant(0, DL));
// Fold the SAD result back into the reduction accumulator.
37203 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
37206 /// Convert vector increment or decrement to sub/add with an all-ones constant:
37207 /// add X, <1, 1...> --> sub X, <-1, -1...>
37208 /// sub X, <1, 1...> --> add X, <-1, -1...>
37209 /// The all-ones vector constant can be materialized using a pcmpeq instruction
37210 /// that is commonly recognized as an idiom (has no register dependency), so
37211 /// that's better/smaller than loading a splat 1 constant.
// NOTE(review): the declaration of SplatVal and the early 'return SDValue()'
// statements fall in elided lines (non-contiguous embedded numbering).
37212 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
37213 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
37214 "Unexpected opcode for increment/decrement transform");
37216 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
37217 // out and wait for legalization if we have an unsupported vector length.
37218 EVT VT = N->getValueType(0);
37219 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
// Only fires when operand 1 is a splat of the constant 1.
37222 SDNode *N1 = N->getOperand(1).getNode();
37224 if (!ISD::isConstantSplatVector(N1, SplatVal) ||
37225 !SplatVal.isOneValue())
// Swap the opcode and substitute the +1 splat with an all-ones (-1) splat.
37228 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
37229 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
37230 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
// Top-level DAG combine for ISD::ADD: tries (in order) the SAD and MADD
// loop-reduction patterns, horizontal add (HADD), the inc/dec-vector
// rewrite, and finally the ADC/SBB combine.
37233 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
37234 const X86Subtarget &Subtarget) {
37235 const SDNodeFlags Flags = N->getFlags();
// Reduction-specific patterns only apply when the add is flagged as a
// vector reduction.
37236 if (Flags.hasVectorReduction()) {
37237 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
37239 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
37242 EVT VT = N->getValueType(0);
37243 SDValue Op0 = N->getOperand(0);
37244 SDValue Op1 = N->getOperand(1);
37246 // Try to synthesize horizontal adds from adds of shuffles.
// Integer HADD needs SSSE3 for the 128-bit types and AVX2 (Int256) for
// the 256-bit types.
37247 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
37248 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
37249 isHorizontalBinOp(Op0, Op1, true))
37250 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
37252 if (SDValue V = combineIncDecVector(N, DAG))
37255 return combineAddOrSubToADCOrSBB(N, DAG);
// Recognize saturating unsigned subtraction: umax(a,b) - b  or  a - umin(a,b)
// become X86ISD::SUBUS (psubus). For the wide types psubus doesn't natively
// support (v8i32/v16i32/v8i64), narrow both operands first when known-bits
// prove the values fit in 16 (or 8) bits.
// NOTE(review): embedded line numbers are non-contiguous — the SubusLHS/RHS
// assignments inside the UMAX/UMIN branches, the KnownBits declaration, the
// ShrinkedType declaration and several early returns are elided from this
// view; verify against upstream before editing.
37258 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
37259 const X86Subtarget &Subtarget) {
37260 SDValue Op0 = N->getOperand(0);
37261 SDValue Op1 = N->getOperand(1);
37262 EVT VT = N->getValueType(0);
37264 // PSUBUS is supported, starting from SSE2, but special preprocessing
37265 // for v8i32 requires umin, which appears in SSE41.
37266 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
37267 !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
37268 !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
37269 !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
37270 (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
37271 VT == MVT::v8i64)))
37274 SDValue SubusLHS, SubusRHS;
37275 // Try to find umax(a,b) - b or a - umin(a,b) patterns
37276 // they may be converted to subus(a,b).
37277 // TODO: Need to add IR cannonicialization for this code.
37278 if (Op0.getOpcode() == ISD::UMAX) {
37280 SDValue MaxLHS = Op0.getOperand(0);
37281 SDValue MaxRHS = Op0.getOperand(1);
37284 else if (MaxRHS == Op1)
37288 } else if (Op1.getOpcode() == ISD::UMIN) {
37290 SDValue MinLHS = Op1.getOperand(0);
37291 SDValue MinRHS = Op1.getOperand(1);
37294 else if (MinRHS == Op0)
37301 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
37302 // special preprocessing in some cases.
// Natively supported element type: emit SUBUS directly.
37303 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
37304 return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);
37306 // Special preprocessing case can be only applied
37307 // if the value was zero extended from 16 bit,
37308 // so we require first 16 bits to be zeros for 32 bit
37309 // values, or first 48 bits for 64 bit values.
37311 DAG.computeKnownBits(SubusLHS, Known);
37312 unsigned NumZeros = Known.countMinLeadingZeros();
37313 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
37316 EVT ExtType = SubusLHS.getValueType();
// Choose the narrow type to subtract in (i16 lanes, or i8 when 24+
// leading zero bits are proven for the v16i32 case).
37318 if (VT == MVT::v8i32 || VT == MVT::v8i64)
37319 ShrinkedType = MVT::v8i16;
37321 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
37323 // If SubusLHS is zeroextended - truncate SubusRHS to it's
37324 // size SubusRHS = umin(0xFFF.., SubusRHS).
// Saturate RHS to the narrow range so truncation cannot change the
// subtraction result.
37325 SDValue SaturationConst =
37326 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
37327 ShrinkedType.getScalarSizeInBits()),
37328 SDLoc(SubusLHS), ExtType);
37329 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
37331 SDValue NewSubusLHS =
37332 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
37333 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
37334 SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
37335 NewSubusLHS, NewSubusRHS);
37336 // Zero extend the result, it may be used somewhere as 32 bit,
37337 // if not zext and following trunc will shrink.
37338 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
// Top-level DAG combine for ISD::SUB: fold C - (Y ^ C2) into an ADD with
// inverted immediate, then try horizontal sub (HSUB), the inc/dec-vector
// rewrite, PSUBUS matching, and finally the ADC/SBB combine.
37341 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
37342 const X86Subtarget &Subtarget) {
37343 SDValue Op0 = N->getOperand(0);
37344 SDValue Op1 = N->getOperand(1);
37346 // X86 can't encode an immediate LHS of a sub. See if we can push the
37347 // negation into a preceding instruction.
37348 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
37349 // If the RHS of the sub is a XOR with one use and a constant, invert the
37350 // immediate. Then add one to the LHS of the sub so we can turn
37351 // X-Y -> X+~Y+1, saving one register.
37352 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
37353 isa<ConstantSDNode>(Op1.getOperand(1))) {
37354 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
37355 EVT VT = Op0.getValueType();
// C - (Y ^ XorC) == (Y ^ ~XorC) + (C + 1), by two's-complement identity.
37356 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
37358 DAG.getConstant(~XorC, SDLoc(Op1), VT));
37359 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
37360 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
37364 // Try to synthesize horizontal subs from subs of shuffles.
// Same feature gating as the HADD case in combineAdd.
37365 EVT VT = N->getValueType(0);
37366 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
37367 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
37368 isHorizontalBinOp(Op0, Op1, false))
37369 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
37371 if (SDValue V = combineIncDecVector(N, DAG))
37374 // Try to create PSUBUS if SUB's argument is max/min
37375 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
37378 return combineAddOrSubToADCOrSBB(N, DAG);
// Combine for vector sign/zero extends (VSEXT/VZEXT and the *_VECTOR_INREG
// forms): constant-fold extends of constant vectors, collapse nested vzext
// chains (possibly through bitcasts), and bypass a scalar
// extract/re-insert that feeds a vzext.
// NOTE(review): embedded line numbers are non-contiguous — the UndefElts
// declaration, the IsZEXT initializer's first line, the loop's 'continue'
// path and several early returns are elided from this view.
37381 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
37382 TargetLowering::DAGCombinerInfo &DCI,
37383 const X86Subtarget &Subtarget) {
37384 if (DCI.isBeforeLegalize())
37388 unsigned Opcode = N->getOpcode();
37389 MVT VT = N->getSimpleValueType(0);
37390 MVT SVT = VT.getVectorElementType();
37391 unsigned NumElts = VT.getVectorNumElements();
37392 unsigned EltSizeInBits = SVT.getSizeInBits();
37394 SDValue Op = N->getOperand(0);
37395 MVT OpVT = Op.getSimpleValueType();
37396 MVT OpEltVT = OpVT.getVectorElementType();
37397 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
// Number of source bits actually consumed by the extend.
37398 unsigned InputBits = OpEltSizeInBits * NumElts;
37400 // Perform any constant folding.
37401 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
37403 SmallVector<APInt, 64> EltBits;
37404 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
37405 APInt Undefs(NumElts, 0);
37406 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
// Zero- vs sign-extend folding is selected by the node's opcode.
37408 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
37409 for (unsigned i = 0; i != NumElts; ++i) {
37410 if (UndefElts[i]) {
37414 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
37415 : EltBits[i].sextOrTrunc(EltSizeInBits);
37417 return getConstVector(Vals, Undefs, VT, DAG, DL);
37420 // (vzext (bitcast (vzext (x)) -> (vzext x)
37421 // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
37422 SDValue V = peekThroughBitcasts(Op);
37423 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
37424 MVT InnerVT = V.getSimpleValueType();
37425 MVT InnerEltVT = InnerVT.getVectorElementType();
37427 // If the element sizes match exactly, we can just do one larger vzext. This
37428 // is always an exact type match as vzext operates on integer types.
37429 if (OpEltVT == InnerEltVT) {
37430 assert(OpVT == InnerVT && "Types must match for vzext!");
37431 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
37434 // The only other way we can combine them is if only a single element of the
37435 // inner vzext is used in the input to the outer vzext.
37436 if (InnerEltVT.getSizeInBits() < InputBits)
37439 // In this case, the inner vzext is completely dead because we're going to
37440 // only look at bits inside of the low element. Just do the outer vzext on
37441 // a bitcast of the input to the inner.
37442 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
37445 // Check if we can bypass extracting and re-inserting an element of an input
37446 // vector. Essentially:
37447 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
37448 // TODO: Add X86ISD::VSEXT support
37449 if (Opcode == X86ISD::VZEXT &&
37450 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37451 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
37452 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
37453 SDValue ExtractedV = V.getOperand(0);
37454 SDValue OrigV = ExtractedV.getOperand(0);
// Only element 0 can be bypassed with a plain bitcast.
37455 if (isNullConstant(ExtractedV.getOperand(1))) {
37456 MVT OrigVT = OrigV.getSimpleValueType();
37457 // Extract a subvector if necessary...
37458 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
37459 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
37460 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
37461 OrigVT.getVectorNumElements() / Ratio);
37462 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
37463 DAG.getIntPtrConstant(0, DL));
37465 Op = DAG.getBitcast(OpVT, OrigV);
37466 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
// Combine for X86ISD::TESTM (AVX-512 vptestm): drop a redundant AND when
// both operands are the same AND, and fold tests against an all-zeros
// vector to a zero mask result.
37473 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
37474 const X86Subtarget &Subtarget) {
37475 SDValue Op0 = N->getOperand(0);
37476 SDValue Op1 = N->getOperand(1);
37478 MVT VT = N->getSimpleValueType(0);
37481 // TEST (AND a, b) ,(AND a, b) -> TEST a, b
37482 if (Op0 == Op1 && Op1->getOpcode() == ISD::AND)
37483 return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0),
37484 Op0->getOperand(1));
37486 // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
37487 // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
37488 if (ISD::isBuildVectorAllZeros(Op0.getNode()) ||
37489 ISD::isBuildVectorAllZeros(Op1.getNode()))
37490 return getZeroVector(VT, Subtarget, DAG, DL);
// Combine for X86ISD::PCMPEQ/PCMPGT with identical operands:
// x == x is all-ones; x > x is all-zeros.
37495 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
37496 const X86Subtarget &Subtarget) {
37497 MVT VT = N->getSimpleValueType(0);
37500 if (N->getOperand(0) == N->getOperand(1)) {
37501 if (N->getOpcode() == X86ISD::PCMPEQ)
37502 return getOnesVector(VT, DAG, DL);
37503 if (N->getOpcode() == X86ISD::PCMPGT)
37504 return getZeroVector(VT, Subtarget, DAG, DL);
// Combine for ISD::INSERT_SUBVECTOR: simplify inserts into zero vectors,
// turn insert-of-extract into a shuffle, merge consecutive subvector loads
// into one wide load or a SUBV_BROADCAST, and canonicalize double-insert
// patterns.
// NOTE(review): embedded line numbers are non-contiguous — several early
// returns, the 'dl' SDLoc declaration and some closing braces are elided
// from this view; verify control flow against upstream before editing.
37510 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
37511 TargetLowering::DAGCombinerInfo &DCI,
37512 const X86Subtarget &Subtarget) {
37513 if (DCI.isBeforeLegalizeOps())
37516 MVT OpVT = N->getSimpleValueType(0);
37518 // Early out for mask vectors.
37519 if (OpVT.getVectorElementType() == MVT::i1)
37523 SDValue Vec = N->getOperand(0);
37524 SDValue SubVec = N->getOperand(1);
37526 unsigned IdxVal = N->getConstantOperandVal(2);
37527 MVT SubVecVT = SubVec.getSimpleValueType();
37529 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
37530 // Inserting zeros into zeros is a nop.
37531 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
37534 // If we're inserting into a zero vector and then into a larger zero vector,
37535 // just insert into the larger zero vector directly.
37536 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
37537 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
37538 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
37539 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
37540 SubVec.getOperand(1),
37541 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
37544 // If we're inserting a bitcast into zeros, rewrite the insert and move the
37545 // bitcast to the other side. This helps with detecting zero extending
37547 // TODO: Is this useful for other indices than 0?
37548 if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
37549 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
37550 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
37551 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
37552 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
37553 DAG.getBitcast(NewVT, Vec),
37554 SubVec.getOperand(0), N->getOperand(2));
37555 return DAG.getBitcast(OpVT, Insert);
37559 // If this is an insert of an extract, combine to a shuffle. Don't do this
37560 // if the insert or extract can be represented with a subregister operation.
37561 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
37562 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
37563 (IdxVal != 0 || !Vec.isUndef())) {
37564 int ExtIdxVal = SubVec.getConstantOperandVal(1);
// ExtIdxVal == 0 is representable as a subregister op, so skip it.
37565 if (ExtIdxVal != 0) {
37566 int VecNumElts = OpVT.getVectorNumElements();
37567 int SubVecNumElts = SubVecVT.getVectorNumElements();
37568 SmallVector<int, 64> Mask(VecNumElts);
37569 // First create an identity shuffle mask.
37570 for (int i = 0; i != VecNumElts; ++i)
37572 // Now insert the extracted portion.
// Second-operand lanes are numbered from VecNumElts in shuffle masks.
37573 for (int i = 0; i != SubVecNumElts; ++i)
37574 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
37576 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
37580 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
37582 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
37583 // (load16 addr + 16), Elts/2)
37586 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
37587 // (load32 addr + 32), Elts/2)
37589 // or a 16-byte or 32-byte broadcast:
37590 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
37591 // (load16 addr), Elts/2)
37592 // --> X86SubVBroadcast(load16 addr)
37594 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
37595 // (load32 addr), Elts/2)
37596 // --> X86SubVBroadcast(load32 addr)
37597 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
37598 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
37599 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
37600 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
37601 if (Idx2 && Idx2->getZExtValue() == 0) {
37602 SDValue SubVec2 = Vec.getOperand(1);
37603 // If needed, look through bitcasts to get to the load.
37604 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
// Only merge when the target says the wide access is fast.
37606 unsigned Alignment = FirstLd->getAlignment();
37607 unsigned AS = FirstLd->getAddressSpace();
37608 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
37609 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
37610 OpVT, AS, Alignment, &Fast) && Fast) {
37611 SDValue Ops[] = {SubVec2, SubVec};
37612 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
37617 // If lower/upper loads are the same and the only users of the load, then
37618 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
37619 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
37620 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
37621 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
37622 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
37624 // If this is subv_broadcast insert into both halves, use a larger
37626 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
37627 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
37628 SubVec.getOperand(0));
37630 // If we're inserting all zeros into the upper half, change this to
37631 // an insert into an all zeros vector. We will match this to a move
37632 // with implicit upper bit zeroing during isel.
37633 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
37634 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
37635 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
37636 Vec.getOperand(2));
37638 // If we are inserting into both halves of the vector, the starting
37639 // vector should be undef. If it isn't, make it so. Only do this if the
37640 // the early insert has no other uses.
37641 // TODO: Should this be a generic DAG combine?
37642 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
37643 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
37644 SubVec2, Vec.getOperand(2));
37645 DCI.AddToWorklist(Vec.getNode());
37646 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
// Combine for ISD::EXTRACT_SUBVECTOR: extracting from an all-zeros or
// all-ones vector yields a smaller all-zeros/all-ones constant, and
// extracting from a BUILD_VECTOR yields a smaller BUILD_VECTOR over the
// corresponding operand slice.
37656 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
37657 TargetLowering::DAGCombinerInfo &DCI,
37658 const X86Subtarget &Subtarget) {
37659 if (DCI.isBeforeLegalizeOps())
37662 MVT OpVT = N->getSimpleValueType(0);
37663 SDValue InVec = N->getOperand(0);
37664 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
37666 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
37667 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
37669 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
// i1 mask vectors use getConstant(1); other types use getOnesVector.
37670 if (OpVT.getScalarType() == MVT::i1)
37671 return DAG.getConstant(1, SDLoc(N), OpVT);
37672 return getOnesVector(OpVT, DAG, SDLoc(N));
37675 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
37676 return DAG.getBuildVector(
37678 InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
// Central dispatch for all X86 target DAG combines: routes each node
// opcode to its dedicated combine* helper. Grouped cases (e.g. the large
// shuffle group) share a single handler.
// NOTE(review): embedded line numbers are non-contiguous — the 'default:'
// case and a few other case labels are elided from this view.
37683 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
37684 DAGCombinerInfo &DCI) const {
37685 SelectionDAG &DAG = DCI.DAG;
37686 switch (N->getOpcode()) {
37688 case ISD::EXTRACT_VECTOR_ELT:
37689 case X86ISD::PEXTRW:
37690 case X86ISD::PEXTRB:
37691 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
37692 case ISD::INSERT_SUBVECTOR:
37693 return combineInsertSubvector(N, DAG, DCI, Subtarget);
37694 case ISD::EXTRACT_SUBVECTOR:
37695 return combineExtractSubvector(N, DAG, DCI, Subtarget);
37698 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
37699 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
37700 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
// Integer arithmetic combines.
37701 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
37702 case ISD::SUB: return combineSub(N, DAG, Subtarget);
37703 case X86ISD::SBB: return combineSBB(N, DAG);
37704 case X86ISD::ADC: return combineADC(N, DAG, DCI);
37705 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
37708 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
37709 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
37710 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
37711 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
// Memory combines.
37712 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
37713 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
37714 case ISD::STORE: return combineStore(N, DAG, Subtarget);
37715 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
// Floating-point and conversion combines.
37716 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
37717 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
37719 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
37720 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
37721 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
37722 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
37723 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
37724 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
37726 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
37728 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
37730 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
37731 case X86ISD::BT: return combineBT(N, DAG, DCI);
// Extension combines.
37732 case ISD::ANY_EXTEND:
37733 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
37734 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
37735 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
37736 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
37737 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
37738 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
37739 case X86ISD::PACKSS:
37740 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
37741 case X86ISD::VSHLI:
37742 case X86ISD::VSRAI:
37743 case X86ISD::VSRLI:
37744 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
37745 case ISD::SIGN_EXTEND_VECTOR_INREG:
37746 case ISD::ZERO_EXTEND_VECTOR_INREG:
37747 case X86ISD::VSEXT:
37748 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
37749 case X86ISD::PINSRB:
37750 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
37751 case X86ISD::SHUFP: // Handle all target specific shuffles
37752 case X86ISD::INSERTPS:
37753 case X86ISD::EXTRQI:
37754 case X86ISD::INSERTQI:
37755 case X86ISD::PALIGNR:
37756 case X86ISD::VSHLDQ:
37757 case X86ISD::VSRLDQ:
37758 case X86ISD::BLENDI:
37759 case X86ISD::UNPCKH:
37760 case X86ISD::UNPCKL:
37761 case X86ISD::MOVHLPS:
37762 case X86ISD::MOVLHPS:
37763 case X86ISD::PSHUFB:
37764 case X86ISD::PSHUFD:
37765 case X86ISD::PSHUFHW:
37766 case X86ISD::PSHUFLW:
37767 case X86ISD::MOVSHDUP:
37768 case X86ISD::MOVSLDUP:
37769 case X86ISD::MOVDDUP:
37770 case X86ISD::MOVSS:
37771 case X86ISD::MOVSD:
37772 case X86ISD::VBROADCAST:
37773 case X86ISD::VPPERM:
37774 case X86ISD::VPERMI:
37775 case X86ISD::VPERMV:
37776 case X86ISD::VPERMV3:
37777 case X86ISD::VPERMIV3:
37778 case X86ISD::VPERMIL2:
37779 case X86ISD::VPERMILPI:
37780 case X86ISD::VPERMILPV:
37781 case X86ISD::VPERM2X128:
37782 case X86ISD::VZEXT_MOVL:
37783 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
// FMA-family combines.
37784 case X86ISD::FMADD_RND:
37785 case X86ISD::FMADDS1_RND:
37786 case X86ISD::FMADDS3_RND:
37787 case X86ISD::FMADDS1:
37788 case X86ISD::FMADDS3:
37789 case X86ISD::FMADD4S:
37790 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
37791 case X86ISD::FMADDSUB_RND:
37792 case X86ISD::FMSUBADD_RND:
37793 case X86ISD::FMADDSUB:
37794 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
37795 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
37796 case X86ISD::MGATHER:
37797 case X86ISD::MSCATTER:
37799 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
37800 case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
37801 case X86ISD::PCMPEQ:
37802 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
37808 /// Return true if the target has native support for the specified value type
37809 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
37810 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
37811 /// some i16 instructions are slow.
// NOTE(review): the switch body (including its opening, default case and
// per-opcode returns) is mostly elided here — only i16 is treated
// specially; other legal types appear to be accepted unconditionally.
37812 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
37813 if (!isTypeLegal(VT))
37815 if (VT != MVT::i16)
37822 case ISD::SIGN_EXTEND:
37823 case ISD::ZERO_EXTEND:
37824 case ISD::ANY_EXTEND:
37837 /// This method query the target whether it is beneficial for dag combiner to
37838 /// promote the specified node. If true, it should return the desired promotion
37839 /// type by reference.
// Decides whether an i16 operation should be promoted (to i32, per the
// usual X86 policy — TODO confirm PVT assignment, which is in elided
// lines). Load-folding opportunities argue against promotion.
// NOTE(review): many case labels and the Promote/Commute assignments fall
// in elided lines of this view.
37840 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
37841 EVT VT = Op.getValueType();
// Only i16 ops are candidates for promotion.
37842 if (VT != MVT::i16)
37845 bool Promote = false;
37846 bool Commute = false;
37847 switch (Op.getOpcode()) {
37849 case ISD::SIGN_EXTEND:
37850 case ISD::ZERO_EXTEND:
37851 case ISD::ANY_EXTEND:
37856 SDValue N0 = Op.getOperand(0);
37857 // Look out for (store (shl (load), x)).
37858 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
37871 SDValue N0 = Op.getOperand(0);
37872 SDValue N1 = Op.getOperand(1);
37873 if (!Commute && MayFoldLoad(N1))
37875 // Avoid disabling potential load folding opportunities.
37876 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
37878 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
// Returns whether a build_vector should be combined into a shuffle+truncate
// pair: not worthwhile for 32-bit elements or without AVX2 (VPERMD wins),
// and lane-crossing masks are handled below (the return for that branch is
// in an elided line).
37888 bool X86TargetLowering::
37889 isDesirableToCombineBuildVectorToShuffleTruncate(
37890 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
// Preconditions: the mask must cover SrcVT's lanes and already be legal.
37892 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
37893 "Element count mismatch");
37895 Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
37896 "Shuffle Mask expected to be legal");
37898 // For 32-bit elements VPERMD is better than shuffle+truncate.
37899 // TODO: After we improve lowerBuildVector, add execption for VPERMW.
37900 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
37903 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
37909 //===----------------------------------------------------------------------===//
37910 // X86 Inline Assembly Support
37911 //===----------------------------------------------------------------------===//
37913 // Helper to match a string separated by whitespace.
// Returns true when S consists of exactly the given pieces in order,
// separated by spaces/tabs. NOTE(review): the 'return false' exits and the
// final 'return S.empty()'-style line are elided from this view.
37914 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
37915 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
37917 for (StringRef Piece : Pieces) {
37918 if (!S.startswith(Piece)) // Check if the piece matches.
37921 S = S.substr(Piece.size());
37922 StringRef::size_type Pos = S.find_first_not_of(" \t");
// Pos == 0 means no whitespace followed the piece, i.e. the piece was
// only a prefix of a longer token — reject.
37923 if (Pos == 0) // We matched a prefix.
// Checks that an inline-asm clobber list names exactly the flag registers
// (cc, flags, fpsr, and optionally dirflag) so a bswap/rol replacement is
// safe. NOTE(review): return statements fall in elided lines of this view.
37932 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
37934 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
37935 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
37936 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
37937 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
37939 if (AsmPieces.size() == 3)
37941 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
// Recognize inline-asm byte-swap idioms (bswap, rorw $8, bswap-pair with
// xchgl) and replace the asm with the llvm.bswap intrinsic via
// IntrinsicLowering::LowerToByteSwap.
// NOTE(review): embedded line numbers are non-contiguous — the per-size
// case labels inside the switch and several 'break'/'return false' lines
// are elided from this view.
37948 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
37949 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
37951 const std::string &AsmStr = IA->getAsmString();
// Only integer results whose width is a multiple of 16 can be bswapped.
37953 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
37954 if (!Ty || Ty->getBitWidth() % 16 != 0)
37957 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
37958 SmallVector<StringRef, 4> AsmPieces;
37959 SplitString(AsmStr, AsmPieces, ";\n");
37961 switch (AsmPieces.size()) {
37962 default: return false;
37964 // FIXME: this should verify that we are targeting a 486 or better. If not,
37965 // we will turn this bswap into something that will be lowered to logical
37966 // ops instead of emitting the bswap asm. For now, we don't support 486 or
37967 // lower so don't worry about this.
// Single-statement asm: plain bswap in any of its spellings.
37969 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
37970 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
37971 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
37972 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
37973 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
37974 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
37975 // No need to check constraints, nothing other than the equivalent of
37976 // "=r,0" would be valid here.
37977 return IntrinsicLowering::LowerToByteSwap(CI);
37980 // rorw $$8, ${0:w} --> llvm.bswap.i16
37981 if (CI->getType()->isIntegerTy(16) &&
37982 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
37983 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
37984 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
// The rotate clobbers flags, so the asm must declare flag clobbers.
37986 StringRef ConstraintsStr = IA->getConstraintString();
37987 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
37988 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
37989 if (clobbersFlagRegisters(AsmPieces))
37990 return IntrinsicLowering::LowerToByteSwap(CI);
// Three-statement i32 byte swap via word rotates.
37994 if (CI->getType()->isIntegerTy(32) &&
37995 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
37996 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
37997 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
37998 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
38000 StringRef ConstraintsStr = IA->getConstraintString();
38001 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
38002 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
38003 if (clobbersFlagRegisters(AsmPieces))
38004 return IntrinsicLowering::LowerToByteSwap(CI);
// i64 pattern: the "A" constraint binds edx:eax, so bswap each half and
// exchange.
38007 if (CI->getType()->isIntegerTy(64)) {
38008 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
38009 if (Constraints.size() >= 2 &&
38010 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
38011 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
38012 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
38013 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
38014 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
38015 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
38016 return IntrinsicLowering::LowerToByteSwap(CI);
38024 /// Given a constraint letter, return the type of constraint for this target.
38025 X86TargetLowering::ConstraintType
38026 X86TargetLowering::getConstraintType(StringRef Constraint) const {
// Single-character GCC constraint letters are classified first; 'k' names
// the AVX512 mask registers, so it is a register-class constraint.
38027 if (Constraint.size() == 1) {
38028 switch (Constraint[0]) {
38040 case 'k': // AVX512 masking registers.
38041 return C_RegisterClass;
// Two-character constraints dispatch on the first char, then the second
// (used for the "Y<x>" family of SSE/MMX/mask constraints).
38065 else if (Constraint.size() == 2) {
38066 switch (Constraint[0]) {
38070 switch (Constraint[1]) {
38081 return C_RegisterClass;
// Anything not recognized above falls back to the target-independent
// classification.
38085 return TargetLowering::getConstraintType(Constraint);
38088 /// Examine constraint type and operand type and determine a weight value.
38089 /// This object must already have been set up with the operand type
38090 /// and the current alternative constraint selected.
38091 TargetLowering::ConstraintWeight
38092 X86TargetLowering::getSingleConstraintMatchWeight(
38093 AsmOperandInfo &info, const char *constraint) const {
38094 ConstraintWeight weight = CW_Invalid;
38095 Value *CallOperandVal = info.CallOperandVal;
38096 // If we don't have a value, we can't do a match,
38097 // but allow it at the lowest weight.
38098 if (!CallOperandVal)
// NOTE(review): the return value for the null-operand case is not visible in
// this extract; presumably CW_Default — confirm against upstream.
38100 Type *type = CallOperandVal->getType();
38101 // Look at the constraint type.
38102 switch (*constraint) {
// Default case: defer to the target-independent weighting.
38104 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
// Specific-register constraints: integer GPR, FP stack, and MMX operands
// each earn CW_SpecificReg when the operand type and ISA features match.
38116 if (CallOperandVal->getType()->isIntegerTy())
38117 weight = CW_SpecificReg;
38122 if (type->isFloatingPointTy())
38123 weight = CW_SpecificReg;
38126 if (type->isX86_MMXTy() && Subtarget.hasMMX())
38127 weight = CW_SpecificReg;
// 'Y' and its two-character "Y<c>" variants: peek at the second character,
// treating bare 'Y' as synonymous with "Yi".
38130 unsigned Size = StringRef(constraint).size();
38131 // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
38132 char NextChar = Size == 2 ? constraint[1] : 'i';
38135 switch (NextChar) {
38141 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
38142 return CW_SpecificReg;
38144 // Conditional OpMask regs (AVX512)
38146 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
38147 return CW_Register;
38151 if (type->isX86_MMXTy() && Subtarget.hasMMX())
38154 // Any SSE reg when ISA >= SSE2, same as 'Y'
38158 if (!Subtarget.hasSSE2())
38162 // Fall through (handle "Y" constraint).
// Vector register constraints, weighted by required width vs. ISA level.
38166 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
38167 weight = CW_Register;
38170 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
38171 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
38172 weight = CW_Register;
38175 // Enable conditional vector operations using %k<#> registers.
38176 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
38177 weight = CW_Register;
// Immediate constraints ('I'..'N', 'G', 'e', 'Z' family): score CW_Constant
// when the constant operand fits the range that constraint letter demands.
38180 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
38181 if (C->getZExtValue() <= 31)
38182 weight = CW_Constant;
38186 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38187 if (C->getZExtValue() <= 63)
38188 weight = CW_Constant;
38192 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38193 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
38194 weight = CW_Constant;
38198 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38199 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
38200 weight = CW_Constant;
38204 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38205 if (C->getZExtValue() <= 3)
38206 weight = CW_Constant;
38210 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38211 if (C->getZExtValue() <= 0xff)
38212 weight = CW_Constant;
// Floating-point immediates are acceptable for the FP-constant constraints.
38217 if (isa<ConstantFP>(CallOperandVal)) {
38218 weight = CW_Constant;
// 32-bit sign-extended and zero-extended immediate ranges ('e' / 'Z').
38222 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38223 if ((C->getSExtValue() >= -0x80000000LL) &&
38224 (C->getSExtValue() <= 0x7fffffffLL))
38225 weight = CW_Constant;
38229 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38230 if (C->getZExtValue() <= 0xffffffff)
38231 weight = CW_Constant;
38238 /// Try to replace an X constraint, which matches anything, with another that
38239 /// has more specific requirements based on the type of the corresponding
38241 const char *X86TargetLowering::
38242 LowerXConstraint(EVT ConstraintVT) const {
38243 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
38244 // 'f' like normal targets.
38245 if (ConstraintVT.isFloatingPoint()) {
38246 if (Subtarget.hasSSE2())
38248 if (Subtarget.hasSSE1())
38252 return TargetLowering::LowerXConstraint(ConstraintVT);
38255 /// Lower the specified operand into the Ops vector.
38256 /// If it is invalid, don't add anything to Ops.
38257 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
38258 std::string &Constraint,
38259 std::vector<SDValue>&Ops,
38260 SelectionDAG &DAG) const {
38263 // Only support length 1 constraints for now.
38264 if (Constraint.length() > 1) return;
38266 char ConstraintLetter = Constraint[0];
38267 switch (ConstraintLetter) {
// Each immediate-constraint case below validates that the constant fits the
// range its letter requires, then materializes a target constant.
38270 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38271 if (C->getZExtValue() <= 31) {
38272 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38273 Op.getValueType());
38279 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38280 if (C->getZExtValue() <= 63) {
38281 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38282 Op.getValueType());
38288 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38289 if (isInt<8>(C->getSExtValue())) {
38290 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38291 Op.getValueType());
// 0xff / 0xffff (and 0xffffffff on 64-bit) masks only.
38297 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38298 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
38299 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
38300 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
38301 Op.getValueType());
38307 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38308 if (C->getZExtValue() <= 3) {
38309 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38310 Op.getValueType());
38316 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38317 if (C->getZExtValue() <= 255) {
38318 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38319 Op.getValueType());
38325 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38326 if (C->getZExtValue() <= 127) {
38327 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38328 Op.getValueType());
38334 // 32-bit signed value
38335 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38336 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
38337 C->getSExtValue())) {
38338 // Widen to 64 bits here to get it sign extended.
38339 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
38342 // FIXME gcc accepts some relocatable values here too, but only in certain
38343 // memory models; it's complicated.
38348 // 32-bit unsigned value
38349 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38350 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
38351 C->getZExtValue())) {
38352 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38353 Op.getValueType());
38357 // FIXME gcc accepts some relocatable values here too, but only in certain
38358 // memory models; it's complicated.
38362 // Literal immediates are always ok.
38363 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
38364 // Widen to 64 bits here to get it sign extended.
38365 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
38369 // In any sort of PIC mode addresses need to be computed at runtime by
38370 // adding in a register or some sort of table lookup. These can't
38371 // be used as immediates.
38372 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
38375 // If we are in non-pic codegen mode, we allow the address of a global (with
38376 // an optional displacement) to be used with 'i'.
38377 GlobalAddressSDNode *GA = nullptr;
38378 int64_t Offset = 0;
38380 // Match either (GA), (GA+C), (GA+C1+C2), etc.
// Walk ADD/SUB chains, accumulating the constant displacement into Offset
// until the GlobalAddress node itself is reached.
38382 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
38383 Offset += GA->getOffset();
38385 } else if (Op.getOpcode() == ISD::ADD) {
38386 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
38387 Offset += C->getZExtValue();
38388 Op = Op.getOperand(0);
38391 } else if (Op.getOpcode() == ISD::SUB) {
38392 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
38393 Offset += -C->getZExtValue();
38394 Op = Op.getOperand(0);
38399 // Otherwise, this isn't something we can handle, reject it.
38403 const GlobalValue *GV = GA->getGlobal();
38404 // If we require an extra load to get this address, as in PIC mode, we
38405 // can't accept it.
38406 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
38409 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
38410 GA->getValueType(0), Offset);
// Only push a lowered operand when one of the cases produced a Result;
// otherwise defer to the target-independent implementation.
38415 if (Result.getNode()) {
38416 Ops.push_back(Result);
38419 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
38422 /// Check if \p RC is a general purpose register class.
38423 /// I.e., GR* or one of their variant.
38424 static bool isGRClass(const TargetRegisterClass &RC) {
38425 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
38426 RC.hasSuperClassEq(&X86::GR16RegClass) ||
38427 RC.hasSuperClassEq(&X86::GR32RegClass) ||
38428 RC.hasSuperClassEq(&X86::GR64RegClass) ||
38429 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
38432 /// Check if \p RC is a vector register class.
38433 /// I.e., FR* / VR* or one of their variant.
38434 static bool isFRClass(const TargetRegisterClass &RC) {
38435 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
38436 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
38437 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
38438 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
38439 RC.hasSuperClassEq(&X86::VR512RegClass);
38442 std::pair<unsigned, const TargetRegisterClass *>
38443 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
38444 StringRef Constraint,
38446 // First, see if this is a constraint that directly corresponds to an LLVM
38448 if (Constraint.size() == 1) {
38449 // GCC Constraint Letters
38450 switch (Constraint[0]) {
38452 // TODO: Slight differences here in allocation order and leaving
38453 // RIP in the class. Do they matter any more here than they do
38454 // in the normal allocation?
// 'k': AVX512 opmask registers, picked by the bit width of VT.
38456 if (Subtarget.hasAVX512()) {
38457 // Only supported in AVX512 or later.
38458 switch (VT.SimpleTy) {
38461 return std::make_pair(0U, &X86::VK32RegClass);
38463 return std::make_pair(0U, &X86::VK16RegClass);
38465 return std::make_pair(0U, &X86::VK8RegClass);
38467 return std::make_pair(0U, &X86::VK1RegClass);
38469 return std::make_pair(0U, &X86::VK64RegClass);
38473 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
38474 if (Subtarget.is64Bit()) {
38475 if (VT == MVT::i32 || VT == MVT::f32)
38476 return std::make_pair(0U, &X86::GR32RegClass);
38477 if (VT == MVT::i16)
38478 return std::make_pair(0U, &X86::GR16RegClass);
38479 if (VT == MVT::i8 || VT == MVT::i1)
38480 return std::make_pair(0U, &X86::GR8RegClass);
38481 if (VT == MVT::i64 || VT == MVT::f64)
38482 return std::make_pair(0U, &X86::GR64RegClass);
38486 // 32-bit fallthrough
38487 case 'Q': // Q_REGS
38488 if (VT == MVT::i32 || VT == MVT::f32)
38489 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
38490 if (VT == MVT::i16)
38491 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
38492 if (VT == MVT::i8 || VT == MVT::i1)
38493 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
38494 if (VT == MVT::i64)
38495 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
38497 case 'r': // GENERAL_REGS
38498 case 'l': // INDEX_REGS
38499 if (VT == MVT::i8 || VT == MVT::i1)
38500 return std::make_pair(0U, &X86::GR8RegClass);
38501 if (VT == MVT::i16)
38502 return std::make_pair(0U, &X86::GR16RegClass);
38503 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
38504 return std::make_pair(0U, &X86::GR32RegClass);
38505 return std::make_pair(0U, &X86::GR64RegClass);
38506 case 'R': // LEGACY_REGS
38507 if (VT == MVT::i8 || VT == MVT::i1)
38508 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
38509 if (VT == MVT::i16)
38510 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
38511 if (VT == MVT::i32 || !Subtarget.is64Bit())
38512 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
38513 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
38514 case 'f': // FP Stack registers.
38515 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
38516 // value to the correct fpstack register class.
38517 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
38518 return std::make_pair(0U, &X86::RFP32RegClass);
38519 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
38520 return std::make_pair(0U, &X86::RFP64RegClass);
38521 return std::make_pair(0U, &X86::RFP80RegClass);
38522 case 'y': // MMX_REGS if MMX allowed.
38523 if (!Subtarget.hasMMX()) break;
38524 return std::make_pair(0U, &X86::VR64RegClass);
38525 case 'Y': // SSE_REGS if SSE2 allowed
38526 if (!Subtarget.hasSSE2()) break;
38529 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
38530 if (!Subtarget.hasSSE1()) break;
// 'v' requests the EVEX-extended (X) classes when VLX is available.
38531 bool VConstraint = (Constraint[0] == 'v');
38533 switch (VT.SimpleTy) {
38535 // Scalar SSE types.
38538 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
38539 return std::make_pair(0U, &X86::FR32XRegClass);
38540 return std::make_pair(0U, &X86::FR32RegClass);
38543 if (VConstraint && Subtarget.hasVLX())
38544 return std::make_pair(0U, &X86::FR64XRegClass);
38545 return std::make_pair(0U, &X86::FR64RegClass);
38546 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
// 128/256/512-bit vector types map to VR128/VR256/VR512 (X variants
// when the 'v' constraint and VLX allow it).
38554 if (VConstraint && Subtarget.hasVLX())
38555 return std::make_pair(0U, &X86::VR128XRegClass);
38556 return std::make_pair(0U, &X86::VR128RegClass);
38564 if (VConstraint && Subtarget.hasVLX())
38565 return std::make_pair(0U, &X86::VR256XRegClass);
38566 return std::make_pair(0U, &X86::VR256RegClass);
38571 return std::make_pair(0U, &X86::VR512RegClass);
38575 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
38576 switch (Constraint[1]) {
// "Yi"/"Y" synonyms recurse into the single-character handling.
38582 return getRegForInlineAsmConstraint(TRI, "Y", VT);
38584 if (!Subtarget.hasMMX()) break;
38585 return std::make_pair(0U, &X86::VR64RegClass);
// "Y0"-style: pin to XMM0 specifically.
38588 if (!Subtarget.hasSSE1()) break;
38589 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
38591 // This register class doesn't allocate k0 for masked vector operation.
38592 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
38593 switch (VT.SimpleTy) {
38596 return std::make_pair(0U, &X86::VK32WMRegClass);
38598 return std::make_pair(0U, &X86::VK16WMRegClass);
38600 return std::make_pair(0U, &X86::VK8WMRegClass);
38602 return std::make_pair(0U, &X86::VK1WMRegClass);
38604 return std::make_pair(0U, &X86::VK64WMRegClass);
38611 // Use the default implementation in TargetLowering to convert the register
38612 // constraint into a member of a register class.
38613 std::pair<unsigned, const TargetRegisterClass*> Res;
38614 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
38616 // Not found as a standard register?
38618 // Map st(0) -> st(7) -> ST0
38619 if (Constraint.size() == 7 && Constraint[0] == '{' &&
38620 tolower(Constraint[1]) == 's' &&
38621 tolower(Constraint[2]) == 't' &&
38622 Constraint[3] == '(' &&
38623 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
38624 Constraint[5] == ')' &&
38625 Constraint[6] == '}') {
38627 Res.first = X86::FP0+Constraint[4]-'0';
38628 Res.second = &X86::RFP80RegClass;
38632 // GCC allows "st(0)" to be called just plain "st".
38633 if (StringRef("{st}").equals_lower(Constraint)) {
38634 Res.first = X86::FP0;
38635 Res.second = &X86::RFP80RegClass;
// "{flags}" names EFLAGS (the condition-code register class).
38640 if (StringRef("{flags}").equals_lower(Constraint)) {
38641 Res.first = X86::EFLAGS;
38642 Res.second = &X86::CCRRegClass;
38646 // 'A' means [ER]AX + [ER]DX.
38647 if (Constraint == "A") {
38648 if (Subtarget.is64Bit()) {
38649 Res.first = X86::RAX;
38650 Res.second = &X86::GR64_ADRegClass;
38652 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
38653 "Expecting 64, 32 or 16 bit subtarget");
38654 Res.first = X86::EAX;
38655 Res.second = &X86::GR32_ADRegClass;
38662 // Otherwise, check to see if this is a register class of the wrong value
38663 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
38664 // turn into {ax},{dx}.
38665 // MVT::Other is used to specify clobber names.
38666 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
38667 return Res; // Correct type already, nothing to do.
38669 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
38670 // return "eax". This should even work for things like getting 64bit integer
38671 // registers when given an f64 type.
38672 const TargetRegisterClass *Class = Res.second;
38673 // The generic code will match the first register class that contains the
38674 // given register. Thus, based on the ordering of the tablegened file,
38675 // the "plain" GR classes might not come first.
38676 // Therefore, use a helper method.
38677 if (isGRClass(*Class)) {
38678 unsigned Size = VT.getSizeInBits();
38679 if (Size == 1) Size = 8;
38680 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
38682 bool is64Bit = Subtarget.is64Bit();
38683 const TargetRegisterClass *RC =
38684 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
38685 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
38686 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
38687 : &X86::GR64RegClass;
38688 if (RC->contains(DestReg))
38689 Res = std::make_pair(DestReg, RC);
38691 // No register found/type mismatch.
38693 Res.second = nullptr;
38695 } else if (isFRClass(*Class)) {
38696 // Handle references to XMM physical registers that got mapped into the
38697 // wrong class. This can happen with constraints like {xmm0} where the
38698 // target independent register mapper will just pick the first match it can
38699 // find, ignoring the required type.
38701 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
38702 if (VT == MVT::f32 || VT == MVT::i32)
38703 Res.second = &X86::FR32RegClass;
38704 else if (VT == MVT::f64 || VT == MVT::i64)
38705 Res.second = &X86::FR64RegClass;
38706 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
38707 Res.second = &X86::VR128RegClass;
38708 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
38709 Res.second = &X86::VR256RegClass;
38710 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
38711 Res.second = &X86::VR512RegClass;
38713 // Type mismatch and not a clobber: Return an error;
38715 Res.second = nullptr;
38722 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
38723 const AddrMode &AM, Type *Ty,
38724 unsigned AS) const {
38725 // Scaling factors are not free at all.
38726 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
38727 // will take 2 allocations in the out of order engine instead of 1
38728 // for plain addressing mode, i.e. inst (reg1).
38730 // vaddps (%rsi,%drx), %ymm0, %ymm1
38731 // Requires two allocations (one for the load, one for the computation)
38733 // vaddps (%rsi), %ymm0, %ymm1
38734 // Requires just 1 allocation, i.e., freeing allocations for other operations
38735 // and having less micro operations to execute.
38737 // For some X86 architectures, this is even worse because for instance for
38738 // stores, the complex addressing mode forces the instruction to use the
38739 // "load" ports instead of the dedicated "store" port.
38740 // E.g., on Haswell:
38741 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
38742 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
38743 if (isLegalAddressingMode(DL, AM, Ty, AS))
38744 // Scale represents reg2 * scale, thus account for 1
38745 // as soon as we use a second register.
38746 return AM.Scale != 0;
38750 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
38751 // Integer division on x86 is expensive. However, when aggressively optimizing
38752 // for code size, we prefer to use a div instruction, as it is usually smaller
38753 // than the alternative sequence.
38754 // The exception to this is vector division. Since x86 doesn't have vector
38755 // integer division, leaving the division as-is is a loss even in terms of
38756 // size, because it will have to be scalarized, while the alternative code
38757 // sequence can be performed in vector form.
38759 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
38760 return OptSize && !VT.isVector();
38763 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
38764 if (!Subtarget.is64Bit())
38767 // Update IsSplitCSR in X86MachineFunctionInfo.
38768 X86MachineFunctionInfo *AFI =
38769 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
38770 AFI->setIsSplitCSR(true);
// For each callee-saved register handled via copy, copy its value into a
// fresh virtual register at function entry and copy it back right before
// every return block's terminator (split-CSR / CXX_FAST_TLS support).
38773 void X86TargetLowering::insertCopiesSplitCSR(
38774 MachineBasicBlock *Entry,
38775 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
38776 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38777 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
38781 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
38782 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
38783 MachineBasicBlock::iterator MBBI = Entry->begin();
// Walk the null-terminated list of CSRs that must be copied.
38784 for (const MCPhysReg *I = IStart; *I; ++I) {
38785 const TargetRegisterClass *RC = nullptr;
38786 if (X86::GR64RegClass.contains(*I))
38787 RC = &X86::GR64RegClass;
// Only 64-bit GPRs are expected here; anything else is a bug.
38789 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
38791 unsigned NewVR = MRI->createVirtualRegister(RC);
38792 // Create copy from CSR to a virtual register.
38793 // FIXME: this currently does not emit CFI pseudo-instructions, it works
38794 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
38795 // nounwind. If we want to generalize this later, we may need to emit
38796 // CFI pseudo-instructions.
38797 assert(Entry->getParent()->getFunction().hasFnAttribute(
38798 Attribute::NoUnwind) &&
38799 "Function should be nounwind in insertCopiesSplitCSR!");
38800 Entry->addLiveIn(*I);
38801 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
38804 // Insert the copy-back instructions right before the terminator.
38805 for (auto *Exit : Exits)
38806 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
38807 TII->get(TargetOpcode::COPY), *I)
38812 bool X86TargetLowering::supportSwiftError() const {
38813 return Subtarget.is64Bit();
38816 /// Returns the name of the symbol used to emit stack probes or the empty
38817 /// string if not applicable.
38818 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
38819 // If the function specifically requests stack probes, emit them.
38820 if (MF.getFunction().hasFnAttribute("probe-stack"))
38821 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
38823 // Generally, if we aren't on Windows, the platform ABI does not include
38824 // support for stack probes, so don't emit them.
38825 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
38828 // We need a stack probe to conform to the Windows ABI. Choose the right
38830 if (Subtarget.is64Bit())
38831 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
38832 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";