//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"

using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
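  // (Clarifying note: addBypassSlowDiv(32, 8) means that for a 32-bit
  // divide/remainder a run-time check is emitted and, when both operands fit
  // in 8 bits, the much cheaper 8-bit divide is used instead; likewise the
  // 64-bit -> 32-bit bypass below.)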
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
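  // (After UCOMISS/UCOMISD, "unordered" sets ZF just like "equal" does, so
  // OEQ needs ZF set *and* PF clear, and UNE needs ZF clear *or* PF set;
  // either way two flag tests per compare.)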
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS , MVT::i16 , Custom);
    setOperationAction(ISD::ABS , MVT::i32 , Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS , MVT::i64 , Custom);
  }

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
    else
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
    setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
    setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
      setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
    setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
      setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
      setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
    setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
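  // (Concretely: SDIVREM/UDIVREM select to a single IDIV/DIV, which computes
  // quotient and remainder together, and SMUL_LOHI/UMUL_LOHI select to the
  // one-operand IMUL/MUL forms that produce the full double-width product.)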
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  setOperationAction(ISD::BR_JT , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND , MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
  setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);

  setOperationAction(ISD::FREM , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f64 , Expand);
  setOperationAction(ISD::FREM , MVT::f80 , Expand);
  setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
    setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

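  // (Without BMI there is no TZCNT, only BSF, whose destination is undefined
  // when the source is zero; plain CTTZ therefore gets a custom lowering that
  // guards the zero case with a CMOV, while CTTZ_ZERO_UNDEF can use BSF
  // directly.)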
  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

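  // (Without LZCNT only BSR is available, which counts from the other end and
  // is undefined for a zero source, so both CTLZ variants are custom: the BSR
  // result is XORed with width-1, and plain CTLZ additionally guards the zero
  // case.)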
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
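  // (These end up as calls to the usual soft-float helpers, typically
  // __gnu_h2f_ieee / __gnu_f2h_ieee from compiler-rt or libgcc.)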
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
    setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
    setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT , MVT::i1 , Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool , VT, Custom);
    setOperationAction(ISD::JumpTable , VT, Custom);
    setOperationAction(ISD::GlobalAddress , VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol , VT, Custom);
    setOperationAction(ISD::BlockAddress , VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART , MVT::Other, Custom);
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN , VT, Expand);
      setOperationAction(ISD::FCOS , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    // Always expand sin/cos functions even though x87 has an instruction.
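    // (x87 FSIN/FCOS only accept arguments with |x| < 2^63 and are not
    // reliably accurate across that range, so libm calls are used instead.)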
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN , VT, Expand);
      setOperationAction(ISD::FCOS , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS , MVT::f128, Custom);
      setOperationAction(ISD::FNEG , MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
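    // (±0.0 and ±1.0 are the only f80 immediates that can be materialized
    // without a constant-pool load, via FLD0/FLD1 plus FCHS for the negated
    // forms, so only those are registered as legal immediates here.)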
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN , MVT::f80, Expand);
    setOperationAction(ISD::FCOS , MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW , MVT::f32 , Expand);
  setOperationAction(ISD::FPOW , MVT::f64 , Expand);
  setOperationAction(ISD::FPOW , MVT::f80 , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
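    // (Bitwise logic is element-width agnostic, so promoting every integer
    // vector type to v2i64 lets a single set of PAND/POR/PXOR patterns cover
    // them all.)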
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
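    // (AVX2's VPSLLVD/Q, VPSRLVD/Q and VPSRAVD give per-element variable
    // shifts; when they are unavailable the custom lowering falls back to
    // shift-by-scalar forms, shuffles, or bit tricks.)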
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
      }
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    if (HasInt256) {
      // Custom legalize 2x32 to get a little better code.
      setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
      setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::MGATHER, VT, Custom);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i1, MVT::v4i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i1, MVT::v4i32);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);

    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
    }

    // Extends of v16i1/v8i1 to 128-bit vectors.
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom);

    for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    for (auto VT : { MVT::v1i1, MVT::v8i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

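    // (These truncating stores correspond to the AVX-512 down-converting
    // stores, e.g. VPMOVQB/VPMOVQW/VPMOVQD and VPMOVDB/VPMOVDW.)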
1246 if (!Subtarget.hasVLX()) {
1247 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1248 // to 512-bit rather than use the AVX2 instructions so that we can use
1250 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1251 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1252 setOperationAction(ISD::MLOAD, VT, Custom);
1253 setOperationAction(ISD::MSTORE, VT, Custom);
1257 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1258 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1259 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1260 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1261 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1262 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1263 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1264 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1266 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1267 setOperationAction(ISD::FFLOOR, VT, Legal);
1268 setOperationAction(ISD::FCEIL, VT, Legal);
1269 setOperationAction(ISD::FTRUNC, VT, Legal);
1270 setOperationAction(ISD::FRINT, VT, Legal);
1271 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1274 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1275 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1277 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1278 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1279 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1281 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1282 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1283 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1284 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1286 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1287 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1289 setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
1290 setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
1292 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1293 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1294 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1296 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1297 setOperationAction(ISD::SMAX, VT, Legal);
1298 setOperationAction(ISD::UMAX, VT, Legal);
1299 setOperationAction(ISD::SMIN, VT, Legal);
1300 setOperationAction(ISD::UMIN, VT, Legal);
1301 setOperationAction(ISD::ABS, VT, Legal);
1302 setOperationAction(ISD::SRL, VT, Custom);
1303 setOperationAction(ISD::SHL, VT, Custom);
1304 setOperationAction(ISD::SRA, VT, Custom);
1305 setOperationAction(ISD::CTPOP, VT, Custom);
1306 setOperationAction(ISD::CTTZ, VT, Custom);
1307 setOperationAction(ISD::ROTL, VT, Custom);
1308 setOperationAction(ISD::ROTR, VT, Custom);
1311 // Need to promote to 64-bit even though we have 32-bit masked instructions
1312 // because the IR optimizers rearrange bitcasts around logic ops leaving
1313 // too many variations to handle if we don't promote them.
1314 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1315 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1316 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1318 if (Subtarget.hasDQI()) {
1319 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1320 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1321 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1322 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1325 if (Subtarget.hasCDI()) {
1326 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1327 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1328 setOperationAction(ISD::CTLZ, VT, Legal);
1329 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1331 } // Subtarget.hasCDI()
1333 if (Subtarget.hasVPOPCNTDQ()) {
1334 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1335 setOperationAction(ISD::CTPOP, VT, Legal);
1338 // Extract subvector is special because the value type
1339 // (result) is 256-bit but the source is 512-bit wide.
1340 // 128-bit was made Legal under AVX1.
1341 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1342 MVT::v8f32, MVT::v4f64 })
1343 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1345 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1346 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1347 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1348 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1349 setOperationAction(ISD::VSELECT, VT, Custom);
1350 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1351 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1352 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1353 setOperationAction(ISD::MLOAD, VT, Legal);
1354 setOperationAction(ISD::MSTORE, VT, Legal);
1355 setOperationAction(ISD::MGATHER, VT, Custom);
1356 setOperationAction(ISD::MSCATTER, VT, Custom);
1358 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1359 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1360 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1364 if (!Subtarget.useSoftFloat() &&
1365 (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
1366 // These operations are handled on non-VLX by artificially widening in isel patterns.
1368 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1370 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1371 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1372 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1373 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1374 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1376 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1377 setOperationAction(ISD::SMAX, VT, Legal);
1378 setOperationAction(ISD::UMAX, VT, Legal);
1379 setOperationAction(ISD::SMIN, VT, Legal);
1380 setOperationAction(ISD::UMIN, VT, Legal);
1381 setOperationAction(ISD::ABS, VT, Legal);
1384 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1385 setOperationAction(ISD::ROTL, VT, Custom);
1386 setOperationAction(ISD::ROTR, VT, Custom);
1389 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1390 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1391 setOperationAction(ISD::MSCATTER, VT, Custom);
1393 if (Subtarget.hasDQI()) {
1394 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1395 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1396 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1397 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1398 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1402 if (Subtarget.hasCDI()) {
1403 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1404 setOperationAction(ISD::CTLZ, VT, Legal);
1405 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1407 } // Subtarget.hasCDI()
1409 if (Subtarget.hasVPOPCNTDQ()) {
1410 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1411 setOperationAction(ISD::CTPOP, VT, Legal);
1415 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1416 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1417 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1419 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1420 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1422 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1423 setOperationAction(ISD::ADD, VT, Custom);
1424 setOperationAction(ISD::SUB, VT, Custom);
1425 setOperationAction(ISD::MUL, VT, Custom);
1426 setOperationAction(ISD::VSELECT, VT, Expand);
1428 setOperationAction(ISD::TRUNCATE, VT, Custom);
1429 setOperationAction(ISD::SETCC, VT, Custom);
1430 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1431 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1432 setOperationAction(ISD::SELECT, VT, Custom);
1433 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1434 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1437 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1438 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1439 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1440 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1441 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1442 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1444 // Extends from v32i1 masks to 256-bit vectors.
1445 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1446 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1447 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1448 // Extends from v64i1 masks to 512-bit vectors.
1449 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1450 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1451 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1453 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1454 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1455 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1456 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1457 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1458 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1459 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1460 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1461 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1462 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1463 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1464 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1465 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1466 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1467 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1468 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1469 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1470 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1471 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1472 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1473 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1474 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1475 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1477 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1479 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1481 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1482 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1483 setOperationAction(ISD::VSELECT, VT, Custom);
1484 setOperationAction(ISD::ABS, VT, Legal);
1485 setOperationAction(ISD::SRL, VT, Custom);
1486 setOperationAction(ISD::SHL, VT, Custom);
1487 setOperationAction(ISD::SRA, VT, Custom);
1488 setOperationAction(ISD::MLOAD, VT, Legal);
1489 setOperationAction(ISD::MSTORE, VT, Legal);
1490 setOperationAction(ISD::CTPOP, VT, Custom);
1491 setOperationAction(ISD::CTTZ, VT, Custom);
1492 setOperationAction(ISD::CTLZ, VT, Custom);
1493 setOperationAction(ISD::SMAX, VT, Legal);
1494 setOperationAction(ISD::UMAX, VT, Legal);
1495 setOperationAction(ISD::SMIN, VT, Legal);
1496 setOperationAction(ISD::UMIN, VT, Legal);
1498 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1499 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1500 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1503 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1504 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1507 if (Subtarget.hasBITALG()) {
1508 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1509 setOperationAction(ISD::CTPOP, VT, Legal);
1513 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
1514 (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
1515 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1516 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1517 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1520 // These operations are handled on non-VLX by artificially widening in isel patterns.
1522 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1524 if (Subtarget.hasBITALG()) {
1525 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1526 setOperationAction(ISD::CTPOP, VT, Legal);
1530 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1531 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1532 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1534 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1535 setOperationAction(ISD::ADD, VT, Custom);
1536 setOperationAction(ISD::SUB, VT, Custom);
1537 setOperationAction(ISD::MUL, VT, Custom);
1538 setOperationAction(ISD::VSELECT, VT, Expand);
1540 setOperationAction(ISD::TRUNCATE, VT, Custom);
1541 setOperationAction(ISD::SETCC, VT, Custom);
1542 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1543 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1544 setOperationAction(ISD::SELECT, VT, Custom);
1545 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1546 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1549 // TODO: v8i1 concat should be legal without VLX to support concats of
1550 // v1i1, but we won't legalize it correctly currently without introducing
1551 // a v4i1 concat in the middle.
1552 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1553 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1554 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1555 for (auto VT : { MVT::v2i1, MVT::v4i1 })
1556 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1558 // Extends from v2i1/v4i1 masks to 128-bit vectors.
1559 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1560 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
1561 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
1562 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
1563 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom);
1564 setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom);
1566 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1567 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1568 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1569 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1570 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1572 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1573 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1574 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1575 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1576 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1578 if (Subtarget.hasDQI()) {
1579 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1580 // v2f32 UINT_TO_FP is already custom under SSE2.
1581 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1582 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1583 "Unexpected operation action!");
1584 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1585 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1586 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1589 if (Subtarget.hasBWI()) {
1590 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1591 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1595 // We want to custom lower some of our intrinsics.
1596 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1597 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1598 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1599 if (!Subtarget.is64Bit()) {
1600 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1601 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1604 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1605 // handle type legalization for these operations here.
1607 // FIXME: We really should do custom legalization for addition and
1608 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1609 // than generic legalization for 64-bit multiplication-with-overflow, though.
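// Roughly, (SADDO x, y) is custom lowered to an X86ISD::ADD that also produces
// EFLAGS, followed by an X86ISD::SETCC reading the overflow flag; the other
// *O nodes follow the same pattern with their respective condition codes.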
1610 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1611 if (VT == MVT::i64 && !Subtarget.is64Bit())
1613 // Add/Sub/Mul with overflow operations are custom lowered.
1614 setOperationAction(ISD::SADDO, VT, Custom);
1615 setOperationAction(ISD::UADDO, VT, Custom);
1616 setOperationAction(ISD::SSUBO, VT, Custom);
1617 setOperationAction(ISD::USUBO, VT, Custom);
1618 setOperationAction(ISD::SMULO, VT, Custom);
1619 setOperationAction(ISD::UMULO, VT, Custom);
1621 // Support carry in as value rather than glue.
1622 setOperationAction(ISD::ADDCARRY, VT, Custom);
1623 setOperationAction(ISD::SUBCARRY, VT, Custom);
1624 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1627 if (!Subtarget.is64Bit()) {
1628 // These libcalls are not available in 32-bit.
1629 setLibcallName(RTLIB::SHL_I128, nullptr);
1630 setLibcallName(RTLIB::SRL_I128, nullptr);
1631 setLibcallName(RTLIB::SRA_I128, nullptr);
1632 setLibcallName(RTLIB::MUL_I128, nullptr);
1635 // Combine sin / cos into _sincos_stret if it is available.
1636 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1637 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1638 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1639 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
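// For example, sin(x) and cos(x) of the same operand can then be emitted as a
// single __sincos_stret call that returns both results.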
1642 if (Subtarget.isTargetWin64()) {
1643 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1644 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1645 setOperationAction(ISD::SREM, MVT::i128, Custom);
1646 setOperationAction(ISD::UREM, MVT::i128, Custom);
1647 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1648 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1651 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1652 // is. We should promote the value to 64-bits to solve this.
1653 // This is what the CRT headers do - `fmodf` is an inline header
1654 // function casting to f64 and calling `fmod`.
1655 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1656 Subtarget.isTargetWindowsItanium()))
1657 for (ISD::NodeType Op :
1658 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1659 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1660 if (isOperationExpand(Op, MVT::f32))
1661 setOperationAction(Op, MVT::f32, Promote);
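// For example, fmodf(x, y) is then emitted as (float)fmod((double)x, (double)y),
// matching what the CRT's inline fmodf wrapper does.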
1663 // We have target-specific dag combine patterns for the following nodes:
1664 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1665 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1666 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1667 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1668 setTargetDAGCombine(ISD::BITCAST);
1669 setTargetDAGCombine(ISD::VSELECT);
1670 setTargetDAGCombine(ISD::SELECT);
1671 setTargetDAGCombine(ISD::SHL);
1672 setTargetDAGCombine(ISD::SRA);
1673 setTargetDAGCombine(ISD::SRL);
1674 setTargetDAGCombine(ISD::OR);
1675 setTargetDAGCombine(ISD::AND);
1676 setTargetDAGCombine(ISD::ADD);
1677 setTargetDAGCombine(ISD::FADD);
1678 setTargetDAGCombine(ISD::FSUB);
1679 setTargetDAGCombine(ISD::FNEG);
1680 setTargetDAGCombine(ISD::FMA);
1681 setTargetDAGCombine(ISD::FMINNUM);
1682 setTargetDAGCombine(ISD::FMAXNUM);
1683 setTargetDAGCombine(ISD::SUB);
1684 setTargetDAGCombine(ISD::LOAD);
1685 setTargetDAGCombine(ISD::MLOAD);
1686 setTargetDAGCombine(ISD::STORE);
1687 setTargetDAGCombine(ISD::MSTORE);
1688 setTargetDAGCombine(ISD::TRUNCATE);
1689 setTargetDAGCombine(ISD::ZERO_EXTEND);
1690 setTargetDAGCombine(ISD::ANY_EXTEND);
1691 setTargetDAGCombine(ISD::SIGN_EXTEND);
1692 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1693 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1694 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1695 setTargetDAGCombine(ISD::SINT_TO_FP);
1696 setTargetDAGCombine(ISD::UINT_TO_FP);
1697 setTargetDAGCombine(ISD::SETCC);
1698 setTargetDAGCombine(ISD::MUL);
1699 setTargetDAGCombine(ISD::XOR);
1700 setTargetDAGCombine(ISD::MSCATTER);
1701 setTargetDAGCombine(ISD::MGATHER);
1703 computeRegisterProperties(Subtarget.getRegisterInfo());
1705 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1706 MaxStoresPerMemsetOptSize = 8;
1707 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1708 MaxStoresPerMemcpyOptSize = 4;
1709 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1710 MaxStoresPerMemmoveOptSize = 4;
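// For example, with MaxStoresPerMemset == 16 a small constant-length
// @llvm.memset is expanded inline into at most 16 stores; anything larger
// stays a libcall (the limit drops to 8 when optimizing for size).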
1712 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1713 // that needs to benchmarked and balanced with the potential use of vector
1714 // load/store types (PR33329, PR33914).
1715 MaxLoadsPerMemcmp = 2;
1716 MaxLoadsPerMemcmpOptSize = 2;
1718 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1719 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1721 // An out-of-order CPU can speculatively execute past a predictable branch,
1722 // but a conditional move could be stalled by an expensive earlier operation.
1723 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1724 EnableExtLdPromotion = true;
1725 setPrefFunctionAlignment(4); // 2^4 bytes.
1727 verifyIntrinsicTables();
1730 // This has so far only been implemented for 64-bit MachO.
1731 bool X86TargetLowering::useLoadStackGuardNode() const {
1732 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1735 bool X86TargetLowering::useStackGuardXorFP() const {
1736 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1737 return Subtarget.getTargetTriple().isOSMSVCRT();
1740 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1741 const SDLoc &DL) const {
1742 EVT PtrTy = getPointerTy(DAG.getDataLayout());
1743 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1744 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1745 return SDValue(Node, 0);
1748 TargetLoweringBase::LegalizeTypeAction
1749 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1750 if (ExperimentalVectorWideningLegalization &&
1751 VT.getVectorNumElements() != 1 &&
1752 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1753 return TypeWidenVector;
1755 return TargetLoweringBase::getPreferredVectorAction(VT);
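// For instance, under x86-experimental-vector-widening-legalization a v2i16
// value is widened to v8i16 instead of having its elements promoted to a
// wider integer type.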
1758 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1759 LLVMContext& Context,
1764 if (Subtarget.hasAVX512()) {
1765 const unsigned NumElts = VT.getVectorNumElements();
1767 // Figure out what this type will be legalized to.
1768 EVT LegalVT = VT;
1769 while (getTypeAction(Context, LegalVT) != TypeLegal)
1770 LegalVT = getTypeToTransformTo(Context, LegalVT);
1772 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1773 if (LegalVT.getSimpleVT().is512BitVector())
1774 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1776 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1777 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1778 // compare for vXi32/vXi64 for sure. If we have BWI we will also support vXi16/vXi8.
1780 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1781 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1782 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1786 return VT.changeVectorElementTypeToInteger();
1789 /// Helper for getByValTypeAlignment to determine
1790 /// the desired ByVal argument alignment.
1791 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1794 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1795 if (VTy->getBitWidth() == 128)
1797 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1798 unsigned EltAlign = 0;
1799 getMaxByValAlign(ATy->getElementType(), EltAlign);
1800 if (EltAlign > MaxAlign)
1801 MaxAlign = EltAlign;
1802 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1803 for (auto *EltTy : STy->elements()) {
1804 unsigned EltAlign = 0;
1805 getMaxByValAlign(EltTy, EltAlign);
1806 if (EltAlign > MaxAlign)
1807 MaxAlign = EltAlign;
1814 /// Return the desired alignment for ByVal aggregate
1815 /// function arguments in the caller parameter area. For X86, aggregates
1816 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1817 /// are at 4-byte boundaries.
1818 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1819 const DataLayout &DL) const {
1820 if (Subtarget.is64Bit()) {
1821 // Max of 8 and alignment of type.
1822 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1829 if (Subtarget.hasSSE1())
1830 getMaxByValAlign(Ty, Align);
1834 /// Returns the target specific optimal type for load
1835 /// and store operations as a result of memset, memcpy, and memmove
1836 /// lowering. If DstAlign is zero, the destination can satisfy any alignment
1837 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
1838 /// against an alignment requirement,
1839 /// probably because the source does not need to be loaded. If 'IsMemset' is
1840 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1841 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1842 /// source is constant so it does not need to be loaded.
1843 /// It returns EVT::Other if the type should be determined using generic
1844 /// target-independent logic.
1846 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1847 unsigned DstAlign, unsigned SrcAlign,
1848 bool IsMemset, bool ZeroMemset,
1850 MachineFunction &MF) const {
1851 const Function &F = MF.getFunction();
1852 if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
1853 if (Size >= 16 &&
1854 (!Subtarget.isUnalignedMem16Slow() ||
1855 ((DstAlign == 0 || DstAlign >= 16) &&
1856 (SrcAlign == 0 || SrcAlign >= 16)))) {
1857 // FIXME: Check if unaligned 32-byte accesses are slow.
1858 if (Size >= 32 && Subtarget.hasAVX()) {
1859 // Although this isn't a well-supported type for AVX1, we'll let
1860 // legalization and shuffle lowering produce the optimal codegen. If we
1861 // choose an optimal type with a vector element larger than a byte,
1862 // getMemsetStores() may create an intermediate splat (using an integer
1863 // multiply) before we splat as a vector.
1866 if (Subtarget.hasSSE2())
1868 // TODO: Can SSE1 handle a byte vector?
1869 if (Subtarget.hasSSE1())
1871 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1872 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1873 // Do not use f64 to lower memcpy if source is string constant. It's
1874 // better to use i32 to avoid the loads.
1875 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1876 // The gymnastics of splatting a byte value into an XMM register and then
1877 // only using 8-byte stores (because this is a CPU with slow unaligned
1878 // 16-byte accesses) makes that a loser.
1882 // This is a compromise. If we reach here, unaligned accesses may be slow on
1883 // this target. However, creating smaller, aligned accesses could be even
1884 // slower and would certainly be a lot more code.
1885 if (Subtarget.is64Bit() && Size >= 8)
1890 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1892 return X86ScalarSSEf32;
1893 else if (VT == MVT::f64)
1894 return X86ScalarSSEf64;
1899 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1904 switch (VT.getSizeInBits()) {
1906 // 8-byte and under are always assumed to be fast.
1910 *Fast = !Subtarget.isUnalignedMem16Slow();
1913 *Fast = !Subtarget.isUnalignedMem32Slow();
1915 // TODO: What about AVX-512 (512-bit) accesses?
1918 // Misaligned accesses of any size are always allowed.
1922 /// Return the entry encoding for a jump table in the
1923 /// current function. The returned value is a member of the
1924 /// MachineJumpTableInfo::JTEntryKind enum.
1925 unsigned X86TargetLowering::getJumpTableEncoding() const {
1926 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF symbol.
1928 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1929 return MachineJumpTableInfo::EK_Custom32;
1931 // Otherwise, use the normal jump table encoding heuristics.
1932 return TargetLowering::getJumpTableEncoding();
1935 bool X86TargetLowering::useSoftFloat() const {
1936 return Subtarget.useSoftFloat();
1939 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1940 ArgListTy &Args) const {
1942 // Only relabel X86-32 for C / Stdcall CCs.
1943 if (Subtarget.is64Bit())
1945 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1947 unsigned ParamRegs = 0;
1948 if (auto *M = MF->getFunction().getParent())
1949 ParamRegs = M->getNumberRegisterParameters();
1951 // Mark the first N int arguments as having the 'inreg' attribute.
1952 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1953 Type *T = Args[Idx].Ty;
1954 if (T->isPointerTy() || T->isIntegerTy())
1955 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1956 unsigned numRegs = 1;
1957 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1959 if (ParamRegs < numRegs)
1961 ParamRegs -= numRegs;
1962 Args[Idx].IsInReg = true;
1968 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1969 const MachineBasicBlock *MBB,
1970 unsigned uid,MCContext &Ctx) const{
1971 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1972 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF entries.
1974 return MCSymbolRefExpr::create(MBB->getSymbol(),
1975 MCSymbolRefExpr::VK_GOTOFF, Ctx);
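// Roughly, each such entry is emitted as a 32-bit ".long .LBBn_m@GOTOFF"
// value, which is added to the PIC base register when performing the
// indirect jump.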
1978 /// Returns relocation base for the given PIC jumptable.
1979 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1980 SelectionDAG &DAG) const {
1981 if (!Subtarget.is64Bit())
1982 // This doesn't have SDLoc associated with it, but is not really the
1983 // same as a Register.
1984 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1985 getPointerTy(DAG.getDataLayout()));
1989 /// This returns the relocation base for the given PIC jumptable,
1990 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1991 const MCExpr *X86TargetLowering::
1992 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1993 MCContext &Ctx) const {
1994 // X86-64 uses RIP relative addressing based on the jump table label.
1995 if (Subtarget.isPICStyleRIPRel())
1996 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1998 // Otherwise, the reference is relative to the PIC base.
1999 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2002 std::pair<const TargetRegisterClass *, uint8_t>
2003 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2005 const TargetRegisterClass *RRC = nullptr;
2007 switch (VT.SimpleTy) {
2009 return TargetLowering::findRepresentativeClass(TRI, VT);
2010 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2011 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2014 RRC = &X86::VR64RegClass;
2016 case MVT::f32: case MVT::f64:
2017 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2018 case MVT::v4f32: case MVT::v2f64:
2019 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2020 case MVT::v8f32: case MVT::v4f64:
2021 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2022 case MVT::v16f32: case MVT::v8f64:
2023 RRC = &X86::VR128XRegClass;
2026 return std::make_pair(RRC, Cost);
2029 unsigned X86TargetLowering::getAddressSpace() const {
2030 if (Subtarget.is64Bit())
2031 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2035 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2036 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2037 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2040 static Constant* SegmentOffset(IRBuilder<> &IRB,
2041 unsigned Offset, unsigned AddressSpace) {
2042 return ConstantExpr::getIntToPtr(
2043 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2044 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
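// For example, SegmentOffset(IRB, 0x28, 257) yields
// "inttoptr (i32 40 to i8* addrspace(257)*)", i.e. %fs:0x28 on x86-64
// (address space 257 is the %fs segment, 256 is %gs).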
2047 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2048 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2049 // tcbhead_t; use it instead of the usual global variable (see
2050 // sysdeps/{i386,x86_64}/nptl/tls.h)
2051 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2052 if (Subtarget.isTargetFuchsia()) {
2053 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2054 return SegmentOffset(IRB, 0x10, getAddressSpace());
2056 // %fs:0x28, unless we're using a Kernel code model, in which case
2057 // it's %gs:0x28. gs:0x14 on i386.
2058 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2059 return SegmentOffset(IRB, Offset, getAddressSpace());
2063 return TargetLowering::getIRStackGuard(IRB);
2066 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2067 // MSVC CRT provides functionalities for stack protection.
2068 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2069 // MSVC CRT has a global variable holding security cookie.
2070 M.getOrInsertGlobal("__security_cookie",
2071 Type::getInt8PtrTy(M.getContext()));
2073 // MSVC CRT has a function to validate security cookie.
2074 auto *SecurityCheckCookie = cast<Function>(
2075 M.getOrInsertFunction("__security_check_cookie",
2076 Type::getVoidTy(M.getContext()),
2077 Type::getInt8PtrTy(M.getContext())));
2078 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2079 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
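// With X86_FastCall plus the InReg attribute on the first argument, the
// cookie is expected in ECX, matching the MSVC CRT's __fastcall
// __security_check_cookie.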
2082 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2083 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2085 TargetLowering::insertSSPDeclarations(M);
2088 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2089 // MSVC CRT has a global variable holding security cookie.
2090 if (Subtarget.getTargetTriple().isOSMSVCRT())
2091 return M.getGlobalVariable("__security_cookie");
2092 return TargetLowering::getSDagStackGuard(M);
2095 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2096 // MSVC CRT has a function to validate security cookie.
2097 if (Subtarget.getTargetTriple().isOSMSVCRT())
2098 return M.getFunction("__security_check_cookie");
2099 return TargetLowering::getSSPStackGuardCheck(M);
2102 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2103 if (Subtarget.getTargetTriple().isOSContiki())
2104 return getDefaultSafeStackPointerLocation(IRB, false);
2106 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2107 // definition of TLS_SLOT_SAFESTACK in
2108 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2109 if (Subtarget.isTargetAndroid()) {
2110 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2112 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2113 return SegmentOffset(IRB, Offset, getAddressSpace());
2116 // Fuchsia is similar.
2117 if (Subtarget.isTargetFuchsia()) {
2118 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2119 return SegmentOffset(IRB, 0x18, getAddressSpace());
2122 return TargetLowering::getSafeStackPointerLocation(IRB);
2125 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2126 unsigned DestAS) const {
2127 assert(SrcAS != DestAS && "Expected different address spaces!");
2129 return SrcAS < 256 && DestAS < 256;
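// Address spaces 256, 257 and 258 are used for the %gs, %fs and %ss segment
// overrides, so only casts between "ordinary" address spaces below 256 are
// no-ops.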
2132 //===----------------------------------------------------------------------===//
2133 // Return Value Calling Convention Implementation
2134 //===----------------------------------------------------------------------===//
2136 #include "X86GenCallingConv.inc"
2138 bool X86TargetLowering::CanLowerReturn(
2139 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2140 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2141 SmallVector<CCValAssign, 16> RVLocs;
2142 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2143 return CCInfo.CheckReturn(Outs, RetCC_X86);
2146 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2147 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2151 /// Lowers mask values (v*i1) to the local register values
2152 /// \returns DAG node after lowering to register type
2153 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2154 const SDLoc &Dl, SelectionDAG &DAG) {
2155 EVT ValVT = ValArg.getValueType();
2157 if (ValVT == MVT::v1i1)
2158 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2159 DAG.getIntPtrConstant(0, Dl));
2161 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2162 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2163 // Two stage lowering might be required
2164 // bitcast: v8i1 -> i8 / v16i1 -> i16
2165 // anyextend: i8 -> i32 / i16 -> i32
2166 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2167 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2168 if (ValLoc == MVT::i32)
2169 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2171 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2172 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2173 // One stage lowering is required
2174 // bitcast: v32i1 -> i32 / v64i1 -> i64
2175 return DAG.getBitcast(ValLoc, ValArg);
2177 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
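// For example, returning a v16i1 mask in a 32-bit location is done as
// (i32 (anyext (i16 (bitcast v16i1 X)))), per the two-stage path above.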
2180 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2181 static void Passv64i1ArgInRegs(
2182 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2183 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2184 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2185 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2186 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2187 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2188 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2189 "The value should reside in two registers");
2191 // Before splitting the value we cast it to i64
2192 Arg = DAG.getBitcast(MVT::i64, Arg);
2194 // Splitting the value into two i32 types
2196 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2197 DAG.getConstant(0, Dl, MVT::i32));
2198 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2199 DAG.getConstant(1, Dl, MVT::i32));
2201 // Attach the two i32 types into corresponding registers
2202 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2203 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2207 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2209 const SmallVectorImpl<ISD::OutputArg> &Outs,
2210 const SmallVectorImpl<SDValue> &OutVals,
2211 const SDLoc &dl, SelectionDAG &DAG) const {
2212 MachineFunction &MF = DAG.getMachineFunction();
2213 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2215 // In some cases we need to disable registers from the default CSR list.
2216 // For example, when they are used for argument passing.
2217 bool ShouldDisableCalleeSavedRegister =
2218 CallConv == CallingConv::X86_RegCall ||
2219 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2221 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2222 report_fatal_error("X86 interrupts may not return any value");
2224 SmallVector<CCValAssign, 16> RVLocs;
2225 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2226 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2229 SmallVector<SDValue, 6> RetOps;
2230 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2231 // Operand #1 = Bytes To Pop
2232 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2235 // Copy the result values into the output registers.
2236 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2238 CCValAssign &VA = RVLocs[I];
2239 assert(VA.isRegLoc() && "Can only return in registers!");
2241 // Add the register to the CalleeSaveDisableRegs list.
2242 if (ShouldDisableCalleeSavedRegister)
2243 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2245 SDValue ValToCopy = OutVals[OutsIndex];
2246 EVT ValVT = ValToCopy.getValueType();
2248 // Promote values to the appropriate types.
2249 if (VA.getLocInfo() == CCValAssign::SExt)
2250 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2251 else if (VA.getLocInfo() == CCValAssign::ZExt)
2252 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2253 else if (VA.getLocInfo() == CCValAssign::AExt) {
2254 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2255 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2257 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2259 else if (VA.getLocInfo() == CCValAssign::BCvt)
2260 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2262 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2263 "Unexpected FP-extend for return value.");
2265 // If this is x86-64, and we disabled SSE, we can't return FP values,
2266 // or SSE or MMX vectors.
2267 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2268 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2269 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2270 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2271 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2272 } else if (ValVT == MVT::f64 &&
2273 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2274 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2275 // llvm-gcc has never done it right and no one has noticed, so this
2276 // should be OK for now.
2277 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2278 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2281 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2282 // the RET instruction and handled by the FP Stackifier.
2283 if (VA.getLocReg() == X86::FP0 ||
2284 VA.getLocReg() == X86::FP1) {
2285 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2286 // change the value to the FP stack register class.
2287 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2288 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2289 RetOps.push_back(ValToCopy);
2290 // Don't emit a copytoreg.
2294 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2295 // which is returned in RAX / RDX.
2296 if (Subtarget.is64Bit()) {
2297 if (ValVT == MVT::x86mmx) {
2298 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2299 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2300 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2302 // If we don't have SSE2 available, convert to v4f32 so the generated
2303 // register is legal.
2304 if (!Subtarget.hasSSE2())
2305 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2310 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2312 if (VA.needsCustom()) {
2313 assert(VA.getValVT() == MVT::v64i1 &&
2314 "Currently the only custom case is when we split v64i1 to 2 regs");
2316 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2319 assert(2 == RegsToPass.size() &&
2320 "Expecting two registers after Pass64BitArgInRegs");
2322 // Add the second register to the CalleeSaveDisableRegs list.
2323 if (ShouldDisableCalleeSavedRegister)
2324 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2326 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2329 // Add nodes to the DAG and add the values into the RetOps list
2330 for (auto &Reg : RegsToPass) {
2331 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2332 Flag = Chain.getValue(1);
2333 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2337 // Swift calling convention does not require we copy the sret argument
2338 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2340 // All x86 ABIs require that for returning structs by value we copy
2341 // the sret argument into %rax/%eax (depending on ABI) for the return.
2342 // We saved the argument into a virtual register in the entry block,
2343 // so now we copy the value out and into %rax/%eax.
2345 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2346 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2347 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2348 // either case FuncInfo->setSRetReturnReg() will have been called.
2349 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2350 // When we have both sret and another return value, we should use the
2351 // original Chain stored in RetOps[0], instead of the current Chain updated
2352 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2354 // For the case of sret and another return value, we have
2355 // Chain_0 at the function entry
2356 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2357 // If we use Chain_1 in getCopyFromReg, we will have
2358 // Val = getCopyFromReg(Chain_1)
2359 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2361 // getCopyToReg(Chain_0) will be glued together with
2362 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2363 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2364 // Data dependency from Unit B to Unit A due to usage of Val in
2365 // getCopyToReg(Chain_1, Val)
2366 // Chain dependency from Unit A to Unit B
2368 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2369 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2370 getPointerTy(MF.getDataLayout()));
2373 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2374 X86::RAX : X86::EAX;
2375 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2376 Flag = Chain.getValue(1);
2378 // RAX/EAX now acts like a return value.
2380 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2382 // Add the returned register to the CalleeSaveDisableRegs list.
2383 if (ShouldDisableCalleeSavedRegister)
2384 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2387 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2388 const MCPhysReg *I =
2389 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2392 if (X86::GR64RegClass.contains(*I))
2393 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2395 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2399 RetOps[0] = Chain; // Update chain.
2401 // Add the flag if we have it.
2403 RetOps.push_back(Flag);
2405 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2406 if (CallConv == CallingConv::X86_INTR)
2407 opcode = X86ISD::IRET;
2408 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2411 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2412 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2415 SDValue TCChain = Chain;
2416 SDNode *Copy = *N->use_begin();
2417 if (Copy->getOpcode() == ISD::CopyToReg) {
2418 // If the copy has a glue operand, we conservatively assume it isn't safe to
2419 // perform a tail call.
2420 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2422 TCChain = Copy->getOperand(0);
2423 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2426 bool HasRet = false;
2427 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2429 if (UI->getOpcode() != X86ISD::RET_FLAG)
2431 // If we are returning more than one value, we can definitely
2432 // not make a tail call; see PR19530.
2433 if (UI->getNumOperands() > 4)
2435 if (UI->getNumOperands() == 4 &&
2436 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2448 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2449 ISD::NodeType ExtendKind) const {
2450 MVT ReturnMVT = MVT::i32;
2452 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2453 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2454 // The ABI does not require i1, i8 or i16 to be extended.
2456 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2457 // always extending i8/i16 return values, so keep doing that for now.
2459 ReturnMVT = MVT::i8;
2462 EVT MinVT = getRegisterType(Context, ReturnMVT);
2463 return VT.bitsLT(MinVT) ? MinVT : VT;
2466 /// Reads two 32 bit registers and creates a 64 bit mask value.
2467 /// \param VA The current 32 bit value that needs to be assigned.
2468 /// \param NextVA The next 32 bit value that needs to be assigned.
2469 /// \param Root The parent DAG node.
2470 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
2471 ///                        for glue purposes. If the DAG is already using a
2472 ///                        physical register instead of a virtual one, we
2473 ///                        should glue our new SDValue to the InFlag SDValue.
2474 /// \return a new 64-bit SDValue.
2475 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2476 SDValue &Root, SelectionDAG &DAG,
2477 const SDLoc &Dl, const X86Subtarget &Subtarget,
2478 SDValue *InFlag = nullptr) {
2479 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2480 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2481 assert(VA.getValVT() == MVT::v64i1 &&
2482 "Expecting first location of 64 bit width type");
2483 assert(NextVA.getValVT() == VA.getValVT() &&
2484 "The locations should have the same type");
2485 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2486 "The values should reside in two registers");
2490 SDValue ArgValueLo, ArgValueHi;
2492 MachineFunction &MF = DAG.getMachineFunction();
2493 const TargetRegisterClass *RC = &X86::GR32RegClass;
2495 // Read a 32 bit value from the registers
2496 if (nullptr == InFlag) {
2497 // When no physical register is present,
2498 // create an intermediate virtual register
2499 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2500 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2501 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2502 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2504 // When a physical register is available read the value from it and glue
2505 // the reads together.
2507 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2508 *InFlag = ArgValueLo.getValue(2);
2510 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2511 *InFlag = ArgValueHi.getValue(2);
2514 // Convert the i32 type into v32i1 type
2515 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2517 // Convert the i32 type into v32i1 type
2518 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2520 // Concatenate the two values together
2521 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2524 /// The function will lower a register of various sizes (8/16/32/64)
2525 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2526 /// \returns a DAG node containing the operand after lowering to mask type.
2527 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2528 const EVT &ValLoc, const SDLoc &Dl,
2529 SelectionDAG &DAG) {
2530 SDValue ValReturned = ValArg;
2532 if (ValVT == MVT::v1i1)
2533 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2535 if (ValVT == MVT::v64i1) {
2536 // On a 32-bit machine, this case is handled by getv64i1Argument.
2537 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2538 // On a 64-bit machine, there is no need to truncate the value, only bitcast it.
2541 switch (ValVT.getSimpleVT().SimpleTy) {
2552 llvm_unreachable("Expecting a vector of i1 types");
2555 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2557 return DAG.getBitcast(ValVT, ValReturned);
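// For example, a v16i1 value returned in an i32 location is recovered as
// (v16i1 (bitcast (i16 (trunc i32 X)))).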
2560 /// Lower the result values of a call into the
2561 /// appropriate copies out of appropriate physical registers.
2563 SDValue X86TargetLowering::LowerCallResult(
2564 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2565 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2566 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2567 uint32_t *RegMask) const {
2569 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2570 // Assign locations to each value returned by this call.
2571 SmallVector<CCValAssign, 16> RVLocs;
2572 bool Is64Bit = Subtarget.is64Bit();
2573 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2575 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2577 // Copy all of the result registers out of their specified physreg.
2578 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2580 CCValAssign &VA = RVLocs[I];
2581 EVT CopyVT = VA.getLocVT();
2583 // In some calling conventions we need to remove the used registers
2584 // from the register mask.
2586 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2587 SubRegs.isValid(); ++SubRegs)
2588 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2591 // If this is x86-64, and we disabled SSE, we can't return FP values
2592 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2593 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2594 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2595 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2598 // If we prefer to use the value in xmm registers, copy it out as f80 and
2599 // use a truncate to move it from fp stack reg to xmm reg.
2600 bool RoundAfterCopy = false;
2601 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2602 isScalarFPTypeInSSEReg(VA.getValVT())) {
2603 if (!Subtarget.hasX87())
2604 report_fatal_error("X87 register return with X87 disabled");
2606 RoundAfterCopy = (CopyVT != VA.getLocVT());
2610 if (VA.needsCustom()) {
2611 assert(VA.getValVT() == MVT::v64i1 &&
2612 "Currently the only custom case is when we split v64i1 to 2 regs");
2614 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2616 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2618 Val = Chain.getValue(0);
2619 InFlag = Chain.getValue(2);
2623 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2624 // This truncation won't change the value.
2625 DAG.getIntPtrConstant(1, dl));
2627 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2628 if (VA.getValVT().isVector() &&
2629 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2630 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2631 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2632 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2634 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2637 InVals.push_back(Val);
2643 //===----------------------------------------------------------------------===//
2644 // C & StdCall & Fast Calling Convention implementation
2645 //===----------------------------------------------------------------------===//
2646 // The StdCall calling convention is standard for many Windows API
2647 // routines. It differs from the C calling convention only slightly: the
2648 // callee cleans up the stack instead of the caller, and symbols are
2649 // decorated differently. It does not support any vector arguments.
2650 // For info on fast calling convention see Fast Calling Convention (tail call)
2651 // implementation LowerX86_32FastCCCallTo.
2653 /// CallIsStructReturn - Determines whether a call uses struct return
2654 /// semantics.
2655 enum StructReturnType {
2656 NotStructReturn,
2657 RegStructReturn,
2658 StackStructReturn
2659 };
2660 static StructReturnType
2661 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2663 return NotStructReturn;
2665 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2666 if (!Flags.isSRet())
2667 return NotStructReturn;
2668 if (Flags.isInReg() || IsMCU)
2669 return RegStructReturn;
2670 return StackStructReturn;
2673 /// Determines whether a function uses struct return semantics.
2674 static StructReturnType
2675 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2677 return NotStructReturn;
2679 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2680 if (!Flags.isSRet())
2681 return NotStructReturn;
2682 if (Flags.isInReg() || IsMCU)
2683 return RegStructReturn;
2684 return StackStructReturn;
2687 /// Make a copy of an aggregate at address specified by "Src" to address
2688 /// "Dst" with size and alignment information specified by the specific
2689 /// parameter attribute. The copy will be passed as a byval function parameter.
2690 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2691 SDValue Chain, ISD::ArgFlagsTy Flags,
2692 SelectionDAG &DAG, const SDLoc &dl) {
2693 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2695 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2696 /*isVolatile*/false, /*AlwaysInline=*/true,
2697 /*isTailCall*/false,
2698 MachinePointerInfo(), MachinePointerInfo());
2701 /// Return true if the calling convention is one that we can guarantee TCO for.
2702 static bool canGuaranteeTCO(CallingConv::ID CC) {
2703 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2704 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2705 CC == CallingConv::HHVM);
2708 /// Return true if we might ever do TCO for calls with this calling convention.
2709 static bool mayTailCallThisCC(CallingConv::ID CC) {
2711 // C calling conventions:
2712 case CallingConv::C:
2713 case CallingConv::Win64:
2714 case CallingConv::X86_64_SysV:
2715 // Callee pop conventions:
2716 case CallingConv::X86_ThisCall:
2717 case CallingConv::X86_StdCall:
2718 case CallingConv::X86_VectorCall:
2719 case CallingConv::X86_FastCall:
2722 return canGuaranteeTCO(CC);
2726 /// Return true if the function is being made into a tailcall target by
2727 /// changing its ABI.
2728 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2729 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2732 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2734 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2735 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2738 ImmutableCallSite CS(CI);
2739 CallingConv::ID CalleeCC = CS.getCallingConv();
2740 if (!mayTailCallThisCC(CalleeCC))
2747 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2748 const SmallVectorImpl<ISD::InputArg> &Ins,
2749 const SDLoc &dl, SelectionDAG &DAG,
2750 const CCValAssign &VA,
2751 MachineFrameInfo &MFI, unsigned i) const {
2752 // Create the nodes corresponding to a load from this parameter slot.
2753 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2754 bool AlwaysUseMutable = shouldGuaranteeTCO(
2755 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2756 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2758 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2760 // If the value is passed by pointer, the address is passed instead of the
2761 // value itself. No need to extend if the mask value and location share the same size.
2763 bool ExtendedInMem =
2764 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2765 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2767 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2768 ValVT = VA.getLocVT();
2770 ValVT = VA.getValVT();
2772 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2773 // taken by a return address.
2775 if (CallConv == CallingConv::X86_INTR) {
2776 // X86 interrupts may take one or two arguments.
2777 // Unlike in a regular call, there is no return address on the stack.
2778 // The offset of the last argument needs to be set to -4/-8 bytes.
2779 // The offset of the first argument (when there are two) should be set to 0 bytes.
2780 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2781 if (Subtarget.is64Bit() && Ins.size() == 2) {
2782 // The stack pointer needs to be realigned for 64 bit handlers with error
2783 // code, so the argument offset changes by 8 bytes.
2788 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2789 // changed with more analysis.
2790 // In case of tail call optimization mark all arguments mutable. Since they
2791 // could be overwritten by lowering of arguments in case of a tail call.
2792 if (Flags.isByVal()) {
2793 unsigned Bytes = Flags.getByValSize();
2794 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2795 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2796 // Adjust SP offset of interrupt parameter.
2797 if (CallConv == CallingConv::X86_INTR) {
2798 MFI.setObjectOffset(FI, Offset);
2800 return DAG.getFrameIndex(FI, PtrVT);
2803 // This is an argument in memory. We might be able to perform copy elision.
2804 if (Flags.isCopyElisionCandidate()) {
2805 EVT ArgVT = Ins[i].ArgVT;
2807 if (Ins[i].PartOffset == 0) {
2808 // If this is a one-part value or the first part of a multi-part value,
2809 // create a stack object for the entire argument value type and return a
2810 // load from our portion of it. This assumes that if the first part of an
2811 // argument is in memory, the rest will also be in memory.
2812 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2813 /*Immutable=*/false);
2814 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2815 return DAG.getLoad(
2816 ValVT, dl, Chain, PartAddr,
2817 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2819 // This is not the first piece of an argument in memory. See if there is
2820 // already a fixed stack object including this offset. If so, assume it
2821 // was created by the PartOffset == 0 branch above and create a load from
2822 // the appropriate offset into it.
2823 int64_t PartBegin = VA.getLocMemOffset();
2824 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2825 int FI = MFI.getObjectIndexBegin();
2826 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2827 int64_t ObjBegin = MFI.getObjectOffset(FI);
2828 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2829 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2832 if (MFI.isFixedObjectIndex(FI)) {
2834 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2835 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2836 return DAG.getLoad(
2837 ValVT, dl, Chain, Addr,
2838 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2839 Ins[i].PartOffset));
2844 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2845 VA.getLocMemOffset(), isImmutable);
2847 // Set SExt or ZExt flag.
2848 if (VA.getLocInfo() == CCValAssign::ZExt) {
2849 MFI.setObjectZExt(FI, true);
2850 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2851 MFI.setObjectSExt(FI, true);
2854 // Adjust SP offset of interrupt parameter.
2855 if (CallConv == CallingConv::X86_INTR) {
2856 MFI.setObjectOffset(FI, Offset);
2859 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2860 SDValue Val = DAG.getLoad(
2861 ValVT, dl, Chain, FIN,
2862 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2863 return ExtendedInMem
2864 ? (VA.getValVT().isVector()
2865 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2866 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2867 : Val;
2870 // FIXME: Get this from tablegen.
2871 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2872 const X86Subtarget &Subtarget) {
2873 assert(Subtarget.is64Bit());
2875 if (Subtarget.isCallingConvWin64(CallConv)) {
2876 static const MCPhysReg GPR64ArgRegsWin64[] = {
2877 X86::RCX, X86::RDX, X86::R8, X86::R9
2879 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2882 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2883 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2885 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2888 // FIXME: Get this from tablegen.
2889 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2890 CallingConv::ID CallConv,
2891 const X86Subtarget &Subtarget) {
2892 assert(Subtarget.is64Bit());
2893 if (Subtarget.isCallingConvWin64(CallConv)) {
2894 // The XMM registers which might contain var arg parameters are shadowed
2895 // in their paired GPR. So we only need to save the GPR to their home
2896 // slots.
2897 // TODO: __vectorcall will change this.
2898 return None;
2899 }
2901 const Function &F = MF.getFunction();
2902 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2903 bool isSoftFloat = Subtarget.useSoftFloat();
2904 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2905 "SSE register cannot be used when SSE is disabled!");
2906 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2907 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2908 // registers.
2909 return None;
2911 static const MCPhysReg XMMArgRegs64Bit[] = {
2912 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2913 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2915 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2919 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2920 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2921 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2922 return A.getValNo() < B.getValNo();
2927 SDValue X86TargetLowering::LowerFormalArguments(
2928 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2929 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2930 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2931 MachineFunction &MF = DAG.getMachineFunction();
2932 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2933 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2935 const Function &F = MF.getFunction();
2936 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
2937 F.getName() == "main")
2938 FuncInfo->setForceFramePointer(true);
2940 MachineFrameInfo &MFI = MF.getFrameInfo();
2941 bool Is64Bit = Subtarget.is64Bit();
2942 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2945 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2946 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2948 if (CallConv == CallingConv::X86_INTR) {
2949 bool isLegal = Ins.size() == 1 ||
2950 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2951 (!Is64Bit && Ins[1].VT == MVT::i32)));
2953 report_fatal_error("X86 interrupts may take one or two arguments");
2956 // Assign locations to all of the incoming arguments.
2957 SmallVector<CCValAssign, 16> ArgLocs;
2958 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2960 // Allocate shadow area for Win64.
2961 if (IsWin64)
2962 CCInfo.AllocateStack(32, 8);
2964 CCInfo.AnalyzeArguments(Ins, CC_X86);
2966 // In vectorcall calling convention a second pass is required for the HVA
2967 // registers.
2968 if (CallingConv::X86_VectorCall == CallConv) {
2969 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2972 // The next loop assumes that the locations are in the same order as the
2973 // incoming arguments.
2974 assert(isSortedByValueNo(ArgLocs) &&
2975 "Argument Location list must be sorted before lowering");
2978 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2980 assert(InsIndex < Ins.size() && "Invalid Ins index");
2981 CCValAssign &VA = ArgLocs[I];
2983 if (VA.isRegLoc()) {
2984 EVT RegVT = VA.getLocVT();
2985 if (VA.needsCustom()) {
2987 VA.getValVT() == MVT::v64i1 &&
2988 "Currently the only custom case is when we split v64i1 to 2 regs");
2990 // v64i1 values, in regcall calling convention, that are
2991 // compiled to 32 bit arch, are split up into two registers.
2993 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2995 const TargetRegisterClass *RC;
2996 if (RegVT == MVT::i32)
2997 RC = &X86::GR32RegClass;
2998 else if (Is64Bit && RegVT == MVT::i64)
2999 RC = &X86::GR64RegClass;
3000 else if (RegVT == MVT::f32)
3001 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3002 else if (RegVT == MVT::f64)
3003 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3004 else if (RegVT == MVT::f80)
3005 RC = &X86::RFP80RegClass;
3006 else if (RegVT == MVT::f128)
3007 RC = &X86::FR128RegClass;
3008 else if (RegVT.is512BitVector())
3009 RC = &X86::VR512RegClass;
3010 else if (RegVT.is256BitVector())
3011 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3012 else if (RegVT.is128BitVector())
3013 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3014 else if (RegVT == MVT::x86mmx)
3015 RC = &X86::VR64RegClass;
3016 else if (RegVT == MVT::v1i1)
3017 RC = &X86::VK1RegClass;
3018 else if (RegVT == MVT::v8i1)
3019 RC = &X86::VK8RegClass;
3020 else if (RegVT == MVT::v16i1)
3021 RC = &X86::VK16RegClass;
3022 else if (RegVT == MVT::v32i1)
3023 RC = &X86::VK32RegClass;
3024 else if (RegVT == MVT::v64i1)
3025 RC = &X86::VK64RegClass;
3027 llvm_unreachable("Unknown argument type!");
3029 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3030 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3033 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3034 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3035 // right size.
3036 if (VA.getLocInfo() == CCValAssign::SExt)
3037 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3038 DAG.getValueType(VA.getValVT()));
3039 else if (VA.getLocInfo() == CCValAssign::ZExt)
3040 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3041 DAG.getValueType(VA.getValVT()));
3042 else if (VA.getLocInfo() == CCValAssign::BCvt)
3043 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3045 if (VA.isExtInLoc()) {
3046 // Handle MMX values passed in XMM regs.
3047 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3048 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3049 else if (VA.getValVT().isVector() &&
3050 VA.getValVT().getScalarType() == MVT::i1 &&
3051 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3052 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3053 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3054 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3056 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3059 assert(VA.isMemLoc());
3061 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3064 // If value is passed via pointer - do a load.
3065 if (VA.getLocInfo() == CCValAssign::Indirect)
3067 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3069 InVals.push_back(ArgValue);
3072 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3073 // Swift calling convention does not require we copy the sret argument
3074 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3075 if (CallConv == CallingConv::Swift)
3078 // All x86 ABIs require that for returning structs by value we copy the
3079 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3080 // the argument into a virtual register so that we can access it from the
3082 if (Ins[I].Flags.isSRet()) {
3083 unsigned Reg = FuncInfo->getSRetReturnReg();
3085 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3086 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3087 FuncInfo->setSRetReturnReg(Reg);
3089 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3090 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3095 unsigned StackSize = CCInfo.getNextStackOffset();
3096 // Align stack specially for tail calls.
3097 if (shouldGuaranteeTCO(CallConv,
3098 MF.getTarget().Options.GuaranteedTailCallOpt))
3099 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3101 // If the function takes variable number of arguments, make a frame index for
3102 // the start of the first vararg value... for expansion of llvm.va_start. We
3103 // can skip this if there are no va_start calls.
3104 if (MFI.hasVAStart() &&
3105 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3106 CallConv != CallingConv::X86_ThisCall))) {
3107 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3110 // Figure out if XMM registers are in use.
3111 assert(!(Subtarget.useSoftFloat() &&
3112 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3113 "SSE register cannot be used when SSE is disabled!");
3115 // 64-bit calling conventions support varargs and register parameters, so we
3116 // have to do extra work to spill them in the prologue.
3117 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3118 // Find the first unallocated argument registers.
3119 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3120 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3121 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3122 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3123 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3124 "SSE register cannot be used when SSE is disabled!");
3126 // Gather all the live in physical registers.
3127 SmallVector<SDValue, 6> LiveGPRs;
3128 SmallVector<SDValue, 8> LiveXMMRegs;
3130 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3131 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3133 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3135 if (!ArgXMMs.empty()) {
3136 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3137 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3138 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3139 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3140 LiveXMMRegs.push_back(
3141 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3146 // Get to the caller-allocated home save location. Add 8 to account
3147 // for the return address.
3148 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3149 FuncInfo->setRegSaveFrameIndex(
3150 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3151 // Fixup to set vararg frame on shadow area (4 x i64).
3153 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3155 // For X86-64, if there are vararg parameters that are passed via
3156 // registers, then we must store them to their spots on the stack so
3157 // they may be loaded by dereferencing the result of va_next.
3158 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3159 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3160 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3161 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
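// Worked example for the SysV x86-64 case: with 6 argument GPRs and 8 argument
// XMMs the register save area is 6*8 + 8*16 = 176 bytes. The GP offset points
// just past the GPR slots already consumed by named arguments, and the FP
// offset starts at 48 (past all GPR slots) plus 16 bytes per XMM already used.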
3164 // Store the integer parameter registers.
3165 SmallVector<SDValue, 8> MemOps;
3166 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3167 getPointerTy(DAG.getDataLayout()));
3168 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3169 for (SDValue Val : LiveGPRs) {
3170 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3171 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3173 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3174 MachinePointerInfo::getFixedStack(
3175 DAG.getMachineFunction(),
3176 FuncInfo->getRegSaveFrameIndex(), Offset));
3177 MemOps.push_back(Store);
3181 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3182 // Now store the XMM (fp + vector) parameter registers.
3183 SmallVector<SDValue, 12> SaveXMMOps;
3184 SaveXMMOps.push_back(Chain);
3185 SaveXMMOps.push_back(ALVal);
3186 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3187 FuncInfo->getRegSaveFrameIndex(), dl));
3188 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3189 FuncInfo->getVarArgsFPOffset(), dl));
3190 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3192 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3193 MVT::Other, SaveXMMOps));
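// Operand layout of the pseudo node: chain, the %al value, the register save
// frame index, the FP area offset, and the live XMM argument values. Packaging
// them in one node lets the later expansion skip the XMM spills when %al
// indicates that no SSE registers were used; the exact expansion is handled
// elsewhere.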
3196 if (!MemOps.empty())
3197 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3200 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3201 // Find the largest legal vector type.
3202 MVT VecVT = MVT::Other;
3203 // FIXME: Only some x86_32 calling conventions support AVX512.
3204 if (Subtarget.hasAVX512() &&
3205 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3206 CallConv == CallingConv::Intel_OCL_BI)))
3207 VecVT = MVT::v16f32;
3208 else if (Subtarget.hasAVX())
3210 else if (Subtarget.hasSSE2())
3213 // We forward some GPRs and some vector types.
3214 SmallVector<MVT, 2> RegParmTypes;
3215 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3216 RegParmTypes.push_back(IntVT);
3217 if (VecVT != MVT::Other)
3218 RegParmTypes.push_back(VecVT);
3220 // Compute the set of forwarded registers. The rest are scratch.
3221 SmallVectorImpl<ForwardedRegister> &Forwards =
3222 FuncInfo->getForwardedMustTailRegParms();
3223 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3225 // Conservatively forward AL on x86_64, since it might be used for varargs.
3226 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3227 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3228 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3231 // Copy all forwards from physical to virtual registers.
3232 for (ForwardedRegister &F : Forwards) {
3233 // FIXME: Can we use a less constrained schedule?
3234 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3235 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3236 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3240 // Some CCs need callee pop.
3241 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3242 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3243 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3244 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3245 // X86 interrupts must pop the error code (and the alignment padding) if
3246 // present.
3247 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3249 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3250 // If this is an sret function, the return should pop the hidden pointer.
3251 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3252 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3253 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3254 FuncInfo->setBytesToPopOnReturn(4);
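// For instance, a 32-bit cdecl function returning a large struct through a
// hidden sret pointer reports 4 bytes to pop here, matching the callee's
// ret-with-immediate that pops the hidden pointer.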
3258 // RegSaveFrameIndex is X86-64 only.
3259 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3260 if (CallConv == CallingConv::X86_FastCall ||
3261 CallConv == CallingConv::X86_ThisCall)
3262 // fastcc functions can't have varargs.
3263 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3266 FuncInfo->setArgumentStackSize(StackSize);
3268 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3269 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3270 if (Personality == EHPersonality::CoreCLR) {
3272 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3273 // that we'd prefer this slot be allocated towards the bottom of the frame
3274 // (i.e. near the stack pointer after allocating the frame). Every
3275 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3276 // offset from the bottom of this and each funclet's frame must be the
3277 // same, so the size of funclets' (mostly empty) frames is dictated by
3278 // how far this slot is from the bottom (since they allocate just enough
3279 // space to accommodate holding this slot at the correct offset).
3280 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3281 EHInfo->PSPSymFrameIdx = PSPSymFI;
3285 if (CallConv == CallingConv::X86_RegCall ||
3286 F.hasFnAttribute("no_caller_saved_registers")) {
3287 MachineRegisterInfo &MRI = MF.getRegInfo();
3288 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3289 MRI.disableCalleeSavedRegister(Pair.first);
3295 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3296 SDValue Arg, const SDLoc &dl,
3298 const CCValAssign &VA,
3299 ISD::ArgFlagsTy Flags) const {
3300 unsigned LocMemOffset = VA.getLocMemOffset();
3301 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3302 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3304 if (Flags.isByVal())
3305 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3307 return DAG.getStore(
3308 Chain, dl, Arg, PtrOff,
3309 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3312 /// Emit a load of return address if tail call
3313 /// optimization is performed and it is required.
3314 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3315 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3316 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3317 // Adjust the Return address stack slot.
3318 EVT VT = getPointerTy(DAG.getDataLayout());
3319 OutRetAddr = getReturnAddressFrameIndex(DAG);
3321 // Load the "old" Return address.
3322 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3323 return SDValue(OutRetAddr.getNode(), 1);
3326 /// Emit a store of the return address if tail call
3327 /// optimization is performed and it is required (FPDiff!=0).
3328 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3329 SDValue Chain, SDValue RetAddrFrIdx,
3330 EVT PtrVT, unsigned SlotSize,
3331 int FPDiff, const SDLoc &dl) {
3332 // Store the return address to the appropriate stack slot.
3333 if (!FPDiff) return Chain;
3334 // Calculate the new stack slot for the return address.
3335 int NewReturnAddrFI =
3336 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3338 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3339 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3340 MachinePointerInfo::getFixedStack(
3341 DAG.getMachineFunction(), NewReturnAddrFI));
3345 /// Returns a vector_shuffle mask for a movs{s|d}, movd
3346 /// operation of specified width.
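/// For example, for a 4-element type this produces the mask {4, 1, 2, 3}:
/// element 0 is taken from V2 and elements 1-3 from V1, the classic
/// movss/movsd "replace the low element" pattern.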
3347 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3349 unsigned NumElems = VT.getVectorNumElements();
3350 SmallVector<int, 8> Mask;
3351 Mask.push_back(NumElems);
3352 for (unsigned i = 1; i != NumElems; ++i)
3354 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3358 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3359 SmallVectorImpl<SDValue> &InVals) const {
3360 SelectionDAG &DAG = CLI.DAG;
3362 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3363 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3364 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3365 SDValue Chain = CLI.Chain;
3366 SDValue Callee = CLI.Callee;
3367 CallingConv::ID CallConv = CLI.CallConv;
3368 bool &isTailCall = CLI.IsTailCall;
3369 bool isVarArg = CLI.IsVarArg;
3371 MachineFunction &MF = DAG.getMachineFunction();
3372 bool Is64Bit = Subtarget.is64Bit();
3373 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3374 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3375 bool IsSibcall = false;
3376 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3377 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3378 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3379 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3380 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3381 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3383 if (CallConv == CallingConv::X86_INTR)
3384 report_fatal_error("X86 interrupts may not be called directly");
3386 if (Attr.getValueAsString() == "true")
3389 if (Subtarget.isPICStyleGOT() &&
3390 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3391 // If we are using a GOT, disable tail calls to external symbols with
3392 // default visibility. Tail calling such a symbol requires using a GOT
3393 // relocation, which forces early binding of the symbol. This breaks code
3394 // that requires lazy function symbol resolution. Using musttail or
3395 // GuaranteedTailCallOpt will override this.
3396 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3397 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3398 G->getGlobal()->hasDefaultVisibility()))
3402 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3403 if (IsMustTail) {
3404 // Force this to be a tail call. The verifier rules are enough to ensure
3405 // that we can lower this successfully without moving the return address
3406 // around.
3407 isTailCall = true;
3408 } else if (isTailCall) {
3409 // Check if it's really possible to do a tail call.
3410 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3411 isVarArg, SR != NotStructReturn,
3412 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3413 Outs, OutVals, Ins, DAG);
3415 // Sibcalls are automatically detected tailcalls which do not require
3416 // ABI changes.
3417 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3418 IsSibcall = true;
3424 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3425 "Var args not supported with calling convention fastcc, ghc or hipe");
3427 // Analyze operands of the call, assigning locations to each operand.
3428 SmallVector<CCValAssign, 16> ArgLocs;
3429 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3431 // Allocate shadow area for Win64.
3432 if (IsWin64)
3433 CCInfo.AllocateStack(32, 8);
3435 CCInfo.AnalyzeArguments(Outs, CC_X86);
3437 // In vectorcall calling convention a second pass is required for the HVA
3438 // registers.
3439 if (CallingConv::X86_VectorCall == CallConv) {
3440 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3443 // Get a count of how many bytes are to be pushed on the stack.
3444 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3446 // This is a sibcall. The memory operands are available in the caller's
3447 // own stack.
3449 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3450 canGuaranteeTCO(CallConv))
3451 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3454 if (isTailCall && !IsSibcall && !IsMustTail) {
3455 // Lower arguments at fp - stackoffset + fpdiff.
3456 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3458 FPDiff = NumBytesCallerPushed - NumBytes;
3460 // Set the delta of movement of the returnaddr stackslot.
3461 // But only set if delta is greater than previous delta.
3462 if (FPDiff < X86Info->getTCReturnAddrDelta())
3463 X86Info->setTCReturnAddrDelta(FPDiff);
3466 unsigned NumBytesToPush = NumBytes;
3467 unsigned NumBytesToPop = NumBytes;
3469 // If we have an inalloca argument, all stack space has already been allocated
3470 // for us and is right at the top of the stack. We don't support multiple
3471 // arguments passed in memory when using inalloca.
3472 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3474 if (!ArgLocs.back().isMemLoc())
3475 report_fatal_error("cannot use inalloca attribute on a register "
3477 if (ArgLocs.back().getLocMemOffset() != 0)
3478 report_fatal_error("any parameter with the inalloca attribute must be "
3479 "the only memory argument");
3483 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3484 NumBytes - NumBytesToPush, dl);
3486 SDValue RetAddrFrIdx;
3487 // Load return address for tail calls.
3488 if (isTailCall && FPDiff)
3489 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3490 Is64Bit, FPDiff, dl);
3492 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3493 SmallVector<SDValue, 8> MemOpChains;
3496 // The next loop assumes that the locations are in the same order as the
3497 // outgoing arguments.
3498 assert(isSortedByValueNo(ArgLocs) &&
3499 "Argument Location list must be sorted before lowering");
3501 // Walk the register/memloc assignments, inserting copies/loads. In the case
3502 // of tail call optimization, arguments are handled later.
3503 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3504 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3506 assert(OutIndex < Outs.size() && "Invalid Out index");
3507 // Skip inalloca arguments, they have already been written.
3508 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3509 if (Flags.isInAlloca())
3512 CCValAssign &VA = ArgLocs[I];
3513 EVT RegVT = VA.getLocVT();
3514 SDValue Arg = OutVals[OutIndex];
3515 bool isByVal = Flags.isByVal();
3517 // Promote the value if needed.
3518 switch (VA.getLocInfo()) {
3519 default: llvm_unreachable("Unknown loc info!");
3520 case CCValAssign::Full: break;
3521 case CCValAssign::SExt:
3522 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3524 case CCValAssign::ZExt:
3525 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3527 case CCValAssign::AExt:
3528 if (Arg.getValueType().isVector() &&
3529 Arg.getValueType().getVectorElementType() == MVT::i1)
3530 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3531 else if (RegVT.is128BitVector()) {
3532 // Special case: passing MMX values in XMM registers.
3533 Arg = DAG.getBitcast(MVT::i64, Arg);
3534 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3535 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3537 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3539 case CCValAssign::BCvt:
3540 Arg = DAG.getBitcast(RegVT, Arg);
3542 case CCValAssign::Indirect: {
3543 // Store the argument.
3544 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3545 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3546 Chain = DAG.getStore(
3547 Chain, dl, Arg, SpillSlot,
3548 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3554 if (VA.needsCustom()) {
3555 assert(VA.getValVT() == MVT::v64i1 &&
3556 "Currently the only custom case is when we split v64i1 to 2 regs");
3557 // Split v64i1 value into two registers
3558 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3560 } else if (VA.isRegLoc()) {
3561 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3562 if (isVarArg && IsWin64) {
3563 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3564 // shadow reg if callee is a varargs function.
3565 unsigned ShadowReg = 0;
3566 switch (VA.getLocReg()) {
3567 case X86::XMM0: ShadowReg = X86::RCX; break;
3568 case X86::XMM1: ShadowReg = X86::RDX; break;
3569 case X86::XMM2: ShadowReg = X86::R8; break;
3570 case X86::XMM3: ShadowReg = X86::R9; break;
3573 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3575 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3576 assert(VA.isMemLoc());
3577 if (!StackPtr.getNode())
3578 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3579 getPointerTy(DAG.getDataLayout()));
3580 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3581 dl, DAG, VA, Flags));
3585 if (!MemOpChains.empty())
3586 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3588 if (Subtarget.isPICStyleGOT()) {
3589 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3592 RegsToPass.push_back(std::make_pair(
3593 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3594 getPointerTy(DAG.getDataLayout()))));
3596 // If we are tail calling and generating PIC/GOT style code load the
3597 // address of the callee into ECX. The value in ecx is used as target of
3598 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3599 // for tail calls on PIC/GOT architectures. Normally we would just put the
3600 // address of GOT into ebx and then call target@PLT. But for tail calls
3601 // ebx would be restored (since ebx is callee saved) before jumping to the
3602 // callee.
3604 // Note: The actual moving to ECX is done further down.
3605 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3606 if (G && !G->getGlobal()->hasLocalLinkage() &&
3607 G->getGlobal()->hasDefaultVisibility())
3608 Callee = LowerGlobalAddress(Callee, DAG);
3609 else if (isa<ExternalSymbolSDNode>(Callee))
3610 Callee = LowerExternalSymbol(Callee, DAG);
3614 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3615 // From AMD64 ABI document:
3616 // For calls that may call functions that use varargs or stdargs
3617 // (prototype-less calls or calls to functions containing ellipsis (...) in
3618 // the declaration) %al is used as hidden argument to specify the number
3619 // of SSE registers used. The contents of %al do not need to match exactly
3620 // the number of registers, but must be an upper bound on the number of SSE
3621 // registers used and is in the range 0 - 8 inclusive.
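// Illustrative (hypothetical) example: for a call like printf("%f\n", x)
// where x is passed in XMM0, %al would be set to 1; for a purely integer
// varargs call it would be 0.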
3623 // Count the number of XMM registers allocated.
3624 static const MCPhysReg XMMArgRegs[] = {
3625 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3626 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3628 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3629 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3630 && "SSE registers cannot be used when SSE is disabled");
3632 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3633 DAG.getConstant(NumXMMRegs, dl,
3637 if (isVarArg && IsMustTail) {
3638 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3639 for (const auto &F : Forwards) {
3640 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3641 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3645 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3646 // don't need this because the eligibility check rejects calls that require
3647 // shuffling arguments passed in memory.
3648 if (!IsSibcall && isTailCall) {
3649 // Force all the incoming stack arguments to be loaded from the stack
3650 // before any new outgoing arguments are stored to the stack, because the
3651 // outgoing stack slots may alias the incoming argument stack slots, and
3652 // the alias isn't otherwise explicit. This is slightly more conservative
3653 // than necessary, because it means that each store effectively depends
3654 // on every argument instead of just those arguments it would clobber.
3655 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3657 SmallVector<SDValue, 8> MemOpChains2;
3660 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3662 CCValAssign &VA = ArgLocs[I];
3664 if (VA.isRegLoc()) {
3665 if (VA.needsCustom()) {
3666 assert((CallConv == CallingConv::X86_RegCall) &&
3667 "Expecting custom case only in regcall calling convention");
3668 // This means that we are in a special case where one argument was
3669 // passed through two register locations - skip the next location.
3676 assert(VA.isMemLoc());
3677 SDValue Arg = OutVals[OutsIndex];
3678 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3679 // Skip inalloca arguments. They don't require any work.
3680 if (Flags.isInAlloca())
3682 // Create frame index.
3683 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3684 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3685 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3686 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3688 if (Flags.isByVal()) {
3689 // Copy relative to framepointer.
3690 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3691 if (!StackPtr.getNode())
3692 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3693 getPointerTy(DAG.getDataLayout()));
3694 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3697 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3701 // Store relative to framepointer.
3702 MemOpChains2.push_back(DAG.getStore(
3703 ArgChain, dl, Arg, FIN,
3704 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3708 if (!MemOpChains2.empty())
3709 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3711 // Store the return address to the appropriate stack slot.
3712 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3713 getPointerTy(DAG.getDataLayout()),
3714 RegInfo->getSlotSize(), FPDiff, dl);
3717 // Build a sequence of copy-to-reg nodes chained together with token chain
3718 // and flag operands which copy the outgoing args into registers.
3720 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3721 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3722 RegsToPass[i].second, InFlag);
3723 InFlag = Chain.getValue(1);
3726 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3727 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3728 // In the 64-bit large code model, we have to make all calls
3729 // through a register, since the call instruction's 32-bit
3730 // pc-relative offset may not be large enough to hold the whole
3731 // address.
3732 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3733 // If the callee is a GlobalAddress node (quite common, every direct call
3734 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3736 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3738 // We should use extra load for direct calls to dllimported functions in
3739 // non-JIT mode.
3740 const GlobalValue *GV = G->getGlobal();
3741 if (!GV->hasDLLImportStorageClass()) {
3742 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3744 Callee = DAG.getTargetGlobalAddress(
3745 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3747 if (OpFlags == X86II::MO_GOTPCREL) {
3749 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3750 getPointerTy(DAG.getDataLayout()), Callee);
3751 // Add extra indirection
3752 Callee = DAG.getLoad(
3753 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3754 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3757 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3758 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3759 unsigned char OpFlags =
3760 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3762 Callee = DAG.getTargetExternalSymbol(
3763 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3764 } else if (Subtarget.isTarget64BitILP32() &&
3765 Callee->getValueType(0) == MVT::i32) {
3766 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3767 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3770 // Returns a chain & a flag for retval copy to use.
3771 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3772 SmallVector<SDValue, 8> Ops;
3774 if (!IsSibcall && isTailCall) {
3775 Chain = DAG.getCALLSEQ_END(Chain,
3776 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3777 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3778 InFlag = Chain.getValue(1);
3781 Ops.push_back(Chain);
3782 Ops.push_back(Callee);
3785 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3787 // Add argument registers to the end of the list so that they are known live
3789 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3790 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3791 RegsToPass[i].second.getValueType()));
3793 // Add a register mask operand representing the call-preserved registers.
3794 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3795 // set X86_INTR calling convention because it has the same CSR mask
3796 // (same preserved registers).
3797 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3798 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3799 assert(Mask && "Missing call preserved mask for calling convention");
3801 // If this is an invoke in a 32-bit function using a funclet-based
3802 // personality, assume the function clobbers all registers. If an exception
3803 // is thrown, the runtime will not restore CSRs.
3804 // FIXME: Model this more precisely so that we can register allocate across
3805 // the normal edge and spill and fill across the exceptional edge.
3806 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3807 const Function &CallerFn = MF.getFunction();
3808 EHPersonality Pers =
3809 CallerFn.hasPersonalityFn()
3810 ? classifyEHPersonality(CallerFn.getPersonalityFn())
3811 : EHPersonality::Unknown;
3812 if (isFuncletEHPersonality(Pers))
3813 Mask = RegInfo->getNoPreservedMask();
3816 // Define a new register mask from the existing mask.
3817 uint32_t *RegMask = nullptr;
3819 // In some calling conventions we need to remove the used physical registers
3820 // from the reg mask.
3821 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3822 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3824 // Allocate a new Reg Mask and copy Mask.
3825 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3826 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3827 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3829 // Make sure all sub registers of the argument registers are reset
3830 // in the RegMask.
3831 for (auto const &RegPair : RegsToPass)
3832 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3833 SubRegs.isValid(); ++SubRegs)
3834 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
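// For example, if RCX is used to pass an argument, this also clears the bits
// for ECX, CX, CH and CL, so no alias of the argument register remains marked
// as preserved in the new mask.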
3836 // Create the RegMask Operand according to our updated mask.
3837 Ops.push_back(DAG.getRegisterMask(RegMask));
3839 // Create the RegMask Operand according to the static mask.
3840 Ops.push_back(DAG.getRegisterMask(Mask));
3843 if (InFlag.getNode())
3844 Ops.push_back(InFlag);
3848 //// If this is the first return lowered for this function, add the regs
3849 //// to the liveout set for the function.
3850 // This isn't right, although it's probably harmless on x86; liveouts
3851 // should be computed from returns not tail calls. Consider a void
3852 // function making a tail call to a function returning int.
3853 MF.getFrameInfo().setHasTailCall();
3854 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3857 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3858 InFlag = Chain.getValue(1);
3860 // Create the CALLSEQ_END node.
3861 unsigned NumBytesForCalleeToPop;
3862 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3863 DAG.getTarget().Options.GuaranteedTailCallOpt))
3864 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3865 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3866 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3867 SR == StackStructReturn)
3868 // If this is a call to a struct-return function, the callee
3869 // pops the hidden struct pointer, so we have to push it back.
3870 // This is common for Darwin/X86, Linux & Mingw32 targets.
3871 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3872 NumBytesForCalleeToPop = 4;
3874 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3876 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3877 // No need to reset the stack after the call if the call doesn't return. To
3878 // make the MI verifier happy, we'll pretend the callee does it for us.
3879 NumBytesForCalleeToPop = NumBytes;
3882 // Returns a flag for retval copy to use.
3884 Chain = DAG.getCALLSEQ_END(Chain,
3885 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3886 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3889 InFlag = Chain.getValue(1);
3892 // Handle result values, copying them out of physregs into vregs that we
3894 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3898 //===----------------------------------------------------------------------===//
3899 // Fast Calling Convention (tail call) implementation
3900 //===----------------------------------------------------------------------===//
3902 //  Like stdcall, the callee cleans up the arguments, but the convention differs
3903 //  in that ECX is reserved for storing the tail-called function's address. Only
3904 //  2 registers are free for argument passing (inreg). Tail call optimization is
3905 //  performed provided:
3906 // * tailcallopt is enabled
3907 // * caller/callee are fastcc
3908 // On X86_64 architecture with GOT-style position independent code only local
3909 // (within module) calls are supported at the moment.
3910 //  To keep the stack aligned according to the platform ABI, the function
3911 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
3912 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example.)
3913 // If a tail called function callee has more arguments than the caller the
3914 // caller needs to make sure that there is room to move the RETADDR to. This is
3915 // achieved by reserving an area the size of the argument delta right after the
3916 // original RETADDR, but before the saved framepointer or the spilled registers
3917 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
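// A small worked example: on x86-32 a fastcc caller taking (arg1, arg2) uses 8
// bytes of argument space; tail-calling a callee that takes four 4-byte
// arguments needs 16 bytes, so an 8-byte move area is reserved right after the
// original RETADDR to make room for relocating the return address.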
3929 /// Make the stack size align, e.g. 16n + 12 aligned for a 16-byte align
3930 /// requirement.
3932 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3933 SelectionDAG& DAG) const {
3934 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3935 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3936 unsigned StackAlignment = TFI.getStackAlignment();
3937 uint64_t AlignMask = StackAlignment - 1;
3938 int64_t Offset = StackSize;
3939 unsigned SlotSize = RegInfo->getSlotSize();
3940 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3941 // Number smaller than 12 so just add the difference.
3942 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3944 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3945 Offset = ((~AlignMask) & Offset) + StackAlignment +
3946 (StackAlignment-SlotSize);
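// Either branch leaves Offset congruent to StackAlignment - SlotSize modulo
// the stack alignment (e.g. 16n + 12 on 32-bit with a 4-byte slot), so that
// once the return-address slot is added the total is a multiple of the
// alignment again.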
3951 /// Return true if the given stack call argument is already available in the
3952 /// same position (relatively) of the caller's incoming argument stack.
3954 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3955 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3956 const X86InstrInfo *TII, const CCValAssign &VA) {
3957 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3960 // Look through nodes that don't alter the bits of the incoming value.
3961 unsigned Op = Arg.getOpcode();
3962 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3963 Arg = Arg.getOperand(0);
3966 if (Op == ISD::TRUNCATE) {
3967 const SDValue &TruncInput = Arg.getOperand(0);
3968 if (TruncInput.getOpcode() == ISD::AssertZext &&
3969 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3970 Arg.getValueType()) {
3971 Arg = TruncInput.getOperand(0);
3979 if (Arg.getOpcode() == ISD::CopyFromReg) {
3980 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3981 if (!TargetRegisterInfo::isVirtualRegister(VR))
3983 MachineInstr *Def = MRI->getVRegDef(VR);
3986 if (!Flags.isByVal()) {
3987 if (!TII->isLoadFromStackSlot(*Def, FI))
3990 unsigned Opcode = Def->getOpcode();
3991 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3992 Opcode == X86::LEA64_32r) &&
3993 Def->getOperand(1).isFI()) {
3994 FI = Def->getOperand(1).getIndex();
3995 Bytes = Flags.getByValSize();
3999 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4000 if (Flags.isByVal())
4001 // ByVal argument is passed in as a pointer but it's now being
4002 // dereferenced. e.g.
4003 // define @foo(%struct.X* %A) {
4004 // tail call @bar(%struct.X* byval %A)
4007 SDValue Ptr = Ld->getBasePtr();
4008 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4011 FI = FINode->getIndex();
4012 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4013 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4014 FI = FINode->getIndex();
4015 Bytes = Flags.getByValSize();
4019 assert(FI != INT_MAX);
4020 if (!MFI.isFixedObjectIndex(FI))
4023 if (Offset != MFI.getObjectOffset(FI))
4026 // If this is not byval, check that the argument stack object is immutable.
4027 // inalloca and argument copy elision can create mutable argument stack
4028 // objects. Byval objects can be mutated, but a byval call intends to pass the
4029 // mutated memory.
4030 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4033 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4034 // If the argument location is wider than the argument type, check that any
4035 // extension flags match.
4036 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4037 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4042 return Bytes == MFI.getObjectSize(FI);
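// Roughly: the outgoing argument can reuse the caller's incoming stack slot
// only when it refers to the same fixed stack object at the same offset with a
// matching size, and (for non-byval cases) the object is immutable and any
// sign/zero-extension flags agree.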
4045 /// Check whether the call is eligible for tail call optimization. Targets
4046 /// that want to do tail call optimization should implement this function.
4047 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4048 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4049 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4050 const SmallVectorImpl<ISD::OutputArg> &Outs,
4051 const SmallVectorImpl<SDValue> &OutVals,
4052 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4053 if (!mayTailCallThisCC(CalleeCC))
4056 // If -tailcallopt is specified, make fastcc functions tail-callable.
4057 MachineFunction &MF = DAG.getMachineFunction();
4058 const Function &CallerF = MF.getFunction();
4060 // If the function return type is x86_fp80 and the callee return type is not,
4061 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4062 // perform a tailcall optimization here.
4063 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4066 CallingConv::ID CallerCC = CallerF.getCallingConv();
4067 bool CCMatch = CallerCC == CalleeCC;
4068 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4069 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4071 // Win64 functions have extra shadow space for argument homing. Don't do the
4072 // sibcall if the caller and callee have mismatched expectations for this
4073 // space.
4074 if (IsCalleeWin64 != IsCallerWin64)
4077 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4078 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4083 // Look for obvious safe cases to perform tail call optimization that do not
4084 // require ABI changes. This is what gcc calls sibcall.
4086 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4087 // emit a special epilogue.
4088 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4089 if (RegInfo->needsStackRealignment(MF))
4092 // Also avoid sibcall optimization if either caller or callee uses struct
4093 // return semantics.
4094 if (isCalleeStructRet || isCallerStructRet)
4097 // Do not sibcall optimize vararg calls unless all arguments are passed via
4098 // registers.
4099 LLVMContext &C = *DAG.getContext();
4100 if (isVarArg && !Outs.empty()) {
4101 // Optimizing for varargs on Win64 is unlikely to be safe without
4102 // additional testing.
4103 if (IsCalleeWin64 || IsCallerWin64)
4106 SmallVector<CCValAssign, 16> ArgLocs;
4107 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4109 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4110 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4111 if (!ArgLocs[i].isRegLoc())
4115 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4116 // stack. Therefore, if it's not used by the call it is not safe to optimize
4117 // this into a sibcall.
4118 bool Unused = false;
4119 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4126 SmallVector<CCValAssign, 16> RVLocs;
4127 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4128 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4129 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4130 CCValAssign &VA = RVLocs[i];
4131 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4136 // Check that the call results are passed in the same way.
4137 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4138 RetCC_X86, RetCC_X86))
4140 // The callee has to preserve all registers the caller needs to preserve.
4141 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4142 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4144 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4145 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4149 unsigned StackArgsSize = 0;
4151 // If the callee takes no arguments then go on to check the results of the
4152 // call.
4153 if (!Outs.empty()) {
4154 // Check if stack adjustment is needed. For now, do not do this if any
4155 // argument is passed on the stack.
4156 SmallVector<CCValAssign, 16> ArgLocs;
4157 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4159 // Allocate shadow area for Win64
4161 CCInfo.AllocateStack(32, 8);
4163 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4164 StackArgsSize = CCInfo.getNextStackOffset();
4166 if (CCInfo.getNextStackOffset()) {
4167 // Check if the arguments are already laid out in the right way as
4168 // the caller's fixed stack objects.
4169 MachineFrameInfo &MFI = MF.getFrameInfo();
4170 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4171 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4172 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4173 CCValAssign &VA = ArgLocs[i];
4174 SDValue Arg = OutVals[i];
4175 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4176 if (VA.getLocInfo() == CCValAssign::Indirect)
4178 if (!VA.isRegLoc()) {
4179 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4186 bool PositionIndependent = isPositionIndependent();
4187 // If the tailcall address may be in a register, then make sure it's
4188 // possible to register allocate for it. In 32-bit, the call address can
4189 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4190 // callee-saved registers are restored. These happen to be the same
4191 // registers used to pass 'inreg' arguments so watch out for those.
4192 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4193 !isa<ExternalSymbolSDNode>(Callee)) ||
4194 PositionIndependent)) {
4195 unsigned NumInRegs = 0;
4196 // In PIC we need an extra register to formulate the address computation
4197 // for the callee.
4198 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4200 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4201 CCValAssign &VA = ArgLocs[i];
4204 unsigned Reg = VA.getLocReg();
4207 case X86::EAX: case X86::EDX: case X86::ECX:
4208 if (++NumInRegs == MaxInRegs)
4215 const MachineRegisterInfo &MRI = MF.getRegInfo();
4216 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4220 bool CalleeWillPop =
4221 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4222 MF.getTarget().Options.GuaranteedTailCallOpt);
4224 if (unsigned BytesToPop =
4225 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4226 // If we have bytes to pop, the callee must pop them.
4227 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4228 if (!CalleePopMatches)
4230 } else if (CalleeWillPop && StackArgsSize > 0) {
4231 // If we don't have bytes to pop, make sure the callee doesn't pop any.
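// For example, a cdecl caller (which pops nothing) cannot sibcall a stdcall
// callee that pops its own stack arguments: the callee's return would
// unbalance the caller's frame, so the checks above reject such mismatches.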
4239 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4240 const TargetLibraryInfo *libInfo) const {
4241 return X86::createFastISel(funcInfo, libInfo);
4244 //===----------------------------------------------------------------------===//
4245 // Other Lowering Hooks
4246 //===----------------------------------------------------------------------===//
4248 static bool MayFoldLoad(SDValue Op) {
4249 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4252 static bool MayFoldIntoStore(SDValue Op) {
4253 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4256 static bool MayFoldIntoZeroExtend(SDValue Op) {
4257 if (Op.hasOneUse()) {
4258 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4259 return (ISD::ZERO_EXTEND == Opcode);
4264 static bool isTargetShuffle(unsigned Opcode) {
4266 default: return false;
4267 case X86ISD::BLENDI:
4268 case X86ISD::PSHUFB:
4269 case X86ISD::PSHUFD:
4270 case X86ISD::PSHUFHW:
4271 case X86ISD::PSHUFLW:
4273 case X86ISD::INSERTPS:
4274 case X86ISD::EXTRQI:
4275 case X86ISD::INSERTQI:
4276 case X86ISD::PALIGNR:
4277 case X86ISD::VSHLDQ:
4278 case X86ISD::VSRLDQ:
4279 case X86ISD::MOVLHPS:
4280 case X86ISD::MOVHLPS:
4281 case X86ISD::MOVLPS:
4282 case X86ISD::MOVLPD:
4283 case X86ISD::MOVSHDUP:
4284 case X86ISD::MOVSLDUP:
4285 case X86ISD::MOVDDUP:
4288 case X86ISD::UNPCKL:
4289 case X86ISD::UNPCKH:
4290 case X86ISD::VBROADCAST:
4291 case X86ISD::VPERMILPI:
4292 case X86ISD::VPERMILPV:
4293 case X86ISD::VPERM2X128:
4294 case X86ISD::VPERMIL2:
4295 case X86ISD::VPERMI:
4296 case X86ISD::VPPERM:
4297 case X86ISD::VPERMV:
4298 case X86ISD::VPERMV3:
4299 case X86ISD::VPERMIV3:
4300 case X86ISD::VZEXT_MOVL:
4305 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4307 default: return false;
4309 case X86ISD::PSHUFB:
4310 case X86ISD::VPERMILPV:
4311 case X86ISD::VPERMIL2:
4312 case X86ISD::VPPERM:
4313 case X86ISD::VPERMV:
4314 case X86ISD::VPERMV3:
4315 case X86ISD::VPERMIV3:
4317 // 'Faux' Target Shuffles.
4324 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4325 MachineFunction &MF = DAG.getMachineFunction();
4326 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4327 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4328 int ReturnAddrIndex = FuncInfo->getRAIndex();
4330 if (ReturnAddrIndex == 0) {
4331 // Set up a frame object for the return address.
4332 unsigned SlotSize = RegInfo->getSlotSize();
4333 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4336 FuncInfo->setRAIndex(ReturnAddrIndex);
4339 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4342 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4343 bool hasSymbolicDisplacement) {
4344 // Offset should fit into 32 bit immediate field.
4345 if (!isInt<32>(Offset))
4348 // If we don't have a symbolic displacement - we don't have any extra
4350 if (!hasSymbolicDisplacement)
4353 // FIXME: Some tweaks might be needed for medium code model.
4354 if (M != CodeModel::Small && M != CodeModel::Kernel)
4357 // For the small code model we assume that the last object lies at most 16MB
4358 // before the end of the 31-bit boundary. We may also accept pretty large negative
4359 // constants, knowing that all objects are in the positive half of the address space.
4360 if (M == CodeModel::Small && Offset < 16*1024*1024)
4363 // For the kernel code model we know that all objects reside in the negative
4364 // half of the 32-bit address space. We may not accept negative offsets, since
4365 // they may push the address out of that range, but we may accept pretty large positive ones.
4366 if (M == CodeModel::Kernel && Offset >= 0)
4372 /// Determines whether the callee is required to pop its own arguments.
4373 /// Callee pop is necessary to support tail calls.
4374 bool X86::isCalleePop(CallingConv::ID CallingConv,
4375 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4376 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4377 // can guarantee TCO.
4378 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4381 switch (CallingConv) {
4384 case CallingConv::X86_StdCall:
4385 case CallingConv::X86_FastCall:
4386 case CallingConv::X86_ThisCall:
4387 case CallingConv::X86_VectorCall:
4392 /// \brief Return true if the condition is an unsigned comparison operation.
4393 static bool isX86CCUnsigned(unsigned X86CC) {
4396 llvm_unreachable("Invalid integer condition!");
4412 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4413 switch (SetCCOpcode) {
4414 default: llvm_unreachable("Invalid integer condition!");
4415 case ISD::SETEQ: return X86::COND_E;
4416 case ISD::SETGT: return X86::COND_G;
4417 case ISD::SETGE: return X86::COND_GE;
4418 case ISD::SETLT: return X86::COND_L;
4419 case ISD::SETLE: return X86::COND_LE;
4420 case ISD::SETNE: return X86::COND_NE;
4421 case ISD::SETULT: return X86::COND_B;
4422 case ISD::SETUGT: return X86::COND_A;
4423 case ISD::SETULE: return X86::COND_BE;
4424 case ISD::SETUGE: return X86::COND_AE;
4428 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4429 /// condition code, returning the condition code and the LHS/RHS of the
4430 /// comparison to make.
4431 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4432 bool isFP, SDValue &LHS, SDValue &RHS,
4433 SelectionDAG &DAG) {
4435 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4436 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4437 // X > -1 -> X == 0, jump !sign.
4438 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4439 return X86::COND_NS;
4440 }
4441 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4442 // X < 0 -> X == 0, jump on sign.
4443 return X86::COND_S;
4444 }
4445 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4446 // X < 1 -> X <= 0
4447 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4448 return X86::COND_LE;
4449 }
4450 }
4452 return TranslateIntegerX86CC(SetCCOpcode);
4455 // First determine if it is required or is profitable to flip the operands.
4457 // If LHS is a foldable load, but RHS is not, flip the condition.
4458 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4459 !ISD::isNON_EXTLoad(RHS.getNode())) {
4460 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4461 std::swap(LHS, RHS);
4464 switch (SetCCOpcode) {
4470 std::swap(LHS, RHS);
4474 // On a floating point condition, the flags are set as follows:
4475 // ZF | PF | CF | Meaning
4476 // 0 | 0 | 0 | X > Y
4477 // 0 | 0 | 1 | X < Y
4478 // 1 | 0 | 0 | X == Y
4479 // 1 | 1 | 1 | unordered
4480 switch (SetCCOpcode) {
4481 default: llvm_unreachable("Condcode should be pre-legalized away");
4483 case ISD::SETEQ: return X86::COND_E;
4484 case ISD::SETOLT: // flipped
4486 case ISD::SETGT: return X86::COND_A;
4487 case ISD::SETOLE: // flipped
4489 case ISD::SETGE: return X86::COND_AE;
4490 case ISD::SETUGT: // flipped
4492 case ISD::SETLT: return X86::COND_B;
4493 case ISD::SETUGE: // flipped
4495 case ISD::SETLE: return X86::COND_BE;
4497 case ISD::SETNE: return X86::COND_NE;
4498 case ISD::SETUO: return X86::COND_P;
4499 case ISD::SETO: return X86::COND_NP;
4501 case ISD::SETUNE: return X86::COND_INVALID;
4505 /// Is there a floating point cmov for the specific X86 condition code?
4506 /// The current x86 ISA includes the following FP cmov instructions:
4507 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4508 static bool hasFPCMov(unsigned X86CC) {
4525 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4527 MachineFunction &MF,
4528 unsigned Intrinsic) const {
4530 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4534 Info.opc = ISD::INTRINSIC_W_CHAIN;
4535 Info.flags = MachineMemOperand::MONone;
4538 switch (IntrData->Type) {
4539 case EXPAND_FROM_MEM: {
4540 Info.ptrVal = I.getArgOperand(0);
4541 Info.memVT = MVT::getVT(I.getType());
4543 Info.flags |= MachineMemOperand::MOLoad;
4546 case COMPRESS_TO_MEM: {
4547 Info.ptrVal = I.getArgOperand(0);
4548 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4550 Info.flags |= MachineMemOperand::MOStore;
4553 case TRUNCATE_TO_MEM_VI8:
4554 case TRUNCATE_TO_MEM_VI16:
4555 case TRUNCATE_TO_MEM_VI32: {
4556 Info.ptrVal = I.getArgOperand(0);
4557 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4558 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4559 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4561 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4562 ScalarVT = MVT::i16;
4563 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4564 ScalarVT = MVT::i32;
4566 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4568 Info.flags |= MachineMemOperand::MOStore;
4578 /// Returns true if the target can instruction select the
4579 /// specified FP immediate natively. If false, the legalizer will
4580 /// materialize the FP immediate as a load from a constant pool.
4581 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4582 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4583 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4589 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4590 ISD::LoadExtType ExtTy,
4592 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4593 // relocation target a movq or addq instruction: don't let the load shrink.
4594 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4595 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4596 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4597 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4601 /// \brief Returns true if it is beneficial to convert a load of a constant
4602 /// to just the constant itself.
4603 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4605 assert(Ty->isIntegerTy());
4607 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4608 if (BitSize == 0 || BitSize > 64)
4613 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4614 // TODO: It might be a win to ease or lift this restriction, but the generic
4615 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4616 if (VT.isVector() && Subtarget.hasAVX512())
4622 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4623 unsigned Index) const {
4624 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4627 // Mask vectors support all subregister combinations and operations that
4628 // extract half of a vector.
4629 if (ResVT.getVectorElementType() == MVT::i1)
4630 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4631 (Index == ResVT.getVectorNumElements()));
4633 return (Index % ResVT.getVectorNumElements()) == 0;
4636 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4637 // Speculate cttz only if we can directly use TZCNT.
4638 return Subtarget.hasBMI();
4641 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4642 // Speculate ctlz only if we can directly use LZCNT.
4643 return Subtarget.hasLZCNT();
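// Bitcasting a load to a v8i1 mask requires AVX512DQ; otherwise defer to the
// generic TargetLowering check.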
4646 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4647 EVT BitcastVT) const {
4648 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
4651 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4654 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4655 const SelectionDAG &DAG) const {
4656 // Do not merge to a float value size (128 bits) if the no-implicit-float
4657 // attribute is set.
4658 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4659 Attribute::NoImplicitFloat);
4662 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4663 return (MemVT.getSizeInBits() <= MaxIntSize);
4668 bool X86TargetLowering::isCtlzFast() const {
4669 return Subtarget.hasFastLZCNT();
4672 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4673 const Instruction &AndI) const {
4677 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4678 if (!Subtarget.hasBMI())
4681 // There are only 32-bit and 64-bit forms for 'andn'.
4682 EVT VT = Y.getValueType();
4683 if (VT != MVT::i32 && VT != MVT::i64)
4689 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4690 MVT VT = MVT::getIntegerVT(NumBits);
4691 if (isTypeLegal(VT))
4694 // PMOVMSKB can handle this.
4695 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4698 // VPMOVMSKB can handle this.
4699 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4702 // TODO: Allow 64-bit type for 32-bit target.
4703 // TODO: 512-bit types should be allowed, but make sure that those
4704 // cases are handled in combineVectorSizedSetCCEquality().
4706 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4709 /// Val is the undef sentinel value or equal to the specified value.
4710 static bool isUndefOrEqual(int Val, int CmpVal) {
4711 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4714 /// Val is either the undef or zero sentinel value.
4715 static bool isUndefOrZero(int Val) {
4716 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4719 /// Return true if every element in Mask, beginning
4720 /// from position Pos and ending at Pos+Size, is the undef sentinel value.
4721 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4722 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4723 if (Mask[i] != SM_SentinelUndef)
4728 /// Return true if Val is undef or if its value falls within the
4729 /// specified range [Low, Hi).
4730 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4731 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4734 /// Return true if every element in Mask is undef or if its value
4735 /// falls within the specified range [Low, Hi).
4736 static bool isUndefOrInRange(ArrayRef<int> Mask,
4739 if (!isUndefOrInRange(M, Low, Hi))
4744 /// Return true if Val is undef, zero or if its value falls within the
4745 /// specified range [Low, Hi).
4746 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4747 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4750 /// Return true if every element in Mask is undef, zero or if its value
4751 /// falls within the specified range [Low, Hi).
4752 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4754 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4759 /// Return true if every element in Mask, beginning
4760 /// from position Pos and ending at Pos+Size, falls within the specified
4761 /// sequential range [Low, Low+Size), or is undef.
4762 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4763 unsigned Pos, unsigned Size, int Low) {
4764 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4765 if (!isUndefOrEqual(Mask[i], Low))
4770 /// Return true if every element in Mask, beginning
4771 /// from position Pos and ending at Pos+Size, falls within the specified
4772 /// sequential range [Low, Low+Size), is undef, or is zero.
4773 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4774 unsigned Size, int Low) {
4775 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4776 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4781 /// Return true if every element in Mask, beginning
4782 /// from position Pos and ending at Pos+Size, is undef or zero.
4783 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4785 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4786 if (!isUndefOrZero(Mask[i]))
4791 /// \brief Helper function to test whether a shuffle mask could be
4792 /// simplified by widening the elements being shuffled.
4794 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4795 /// leaves it in an unspecified state.
4797 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4798 /// shuffle masks. The latter have the special property of a '-2' representing
4799 /// a zero-ed lane of a vector.
4800 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4801 SmallVectorImpl<int> &WidenedMask) {
4802 WidenedMask.assign(Mask.size() / 2, 0);
4803 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4805 int M1 = Mask[i + 1];
4807 // If both elements are undef, it's trivial.
4808 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4809 WidenedMask[i / 2] = SM_SentinelUndef;
4813 // Check for an undef mask and a mask value properly aligned to fit with
4814 // a pair of values. If we find such a case, use the non-undef mask's value.
4815 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4816 WidenedMask[i / 2] = M1 / 2;
4819 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4820 WidenedMask[i / 2] = M0 / 2;
4824 // When zeroing, we need to spread the zeroing across both lanes to widen.
4825 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4826 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4827 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4828 WidenedMask[i / 2] = SM_SentinelZero;
4834 // Finally check if the two mask values are adjacent and aligned with
4836 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4837 WidenedMask[i / 2] = M0 / 2;
4841 // Otherwise we can't safely widen the elements used in this shuffle.
4844 assert(WidenedMask.size() == Mask.size() / 2 &&
4845 "Incorrect size of mask after widening the elements!");
4850 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4851 bool X86::isZeroNode(SDValue Elt) {
4852 return isNullConstant(Elt) || isNullFPConstant(Elt);
4855 // Build a vector of constants.
4856 // Use an UNDEF node if MaskElt == -1.
4857 // Split 64-bit constants into 32-bit halves when i64 is not legal (32-bit mode).
4858 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4859 const SDLoc &dl, bool IsMask = false) {
4861 SmallVector<SDValue, 32> Ops;
4864 MVT ConstVecVT = VT;
4865 unsigned NumElts = VT.getVectorNumElements();
4866 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4867 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4868 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4872 MVT EltVT = ConstVecVT.getVectorElementType();
4873 for (unsigned i = 0; i < NumElts; ++i) {
4874 bool IsUndef = Values[i] < 0 && IsMask;
4875 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4876 DAG.getConstant(Values[i], dl, EltVT);
4877 Ops.push_back(OpNode);
4879 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4880 DAG.getConstant(0, dl, EltVT));
4882 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4884 ConstsNode = DAG.getBitcast(VT, ConstsNode);
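// Build a constant vector directly from per-element APInt bit patterns,
// emitting UNDEF for any element whose bit is set in Undefs.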
4888 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4889 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4890 assert(Bits.size() == Undefs.getBitWidth() &&
4891 "Unequal constant and undef arrays");
4892 SmallVector<SDValue, 32> Ops;
4895 MVT ConstVecVT = VT;
4896 unsigned NumElts = VT.getVectorNumElements();
4897 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4898 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4899 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4903 MVT EltVT = ConstVecVT.getVectorElementType();
4904 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4906 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4909 const APInt &V = Bits[i];
4910 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4912 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4913 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4914 } else if (EltVT == MVT::f32) {
4915 APFloat FV(APFloat::IEEEsingle(), V);
4916 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4917 } else if (EltVT == MVT::f64) {
4918 APFloat FV(APFloat::IEEEdouble(), V);
4919 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4921 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4925 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4926 return DAG.getBitcast(VT, ConstsNode);
4929 /// Returns a vector of specified type with all zero elements.
4930 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4931 SelectionDAG &DAG, const SDLoc &dl) {
4932 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4933 VT.getVectorElementType() == MVT::i1) &&
4934 "Unexpected vector type");
4936 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4937 // type. This ensures they get CSE'd. But if the integer type is not
4938 // available, use a floating-point +0.0 instead.
4940 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4941 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4942 } else if (VT.getVectorElementType() == MVT::i1) {
4943 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4944 "Unexpected vector type");
4945 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4946 "Unexpected vector type");
4947 Vec = DAG.getConstant(0, dl, VT);
4949 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4950 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4952 return DAG.getBitcast(VT, Vec);
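// Common helper to extract a vectorWidth-bit (128/256-bit) chunk of Vec
// starting at element IdxVal, rounded down to a whole chunk.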
4955 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4956 const SDLoc &dl, unsigned vectorWidth) {
4957 EVT VT = Vec.getValueType();
4958 EVT ElVT = VT.getVectorElementType();
4959 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4960 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4961 VT.getVectorNumElements()/Factor);
4963 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4964 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4965 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4967 // This is the index of the first element of the vectorWidth-bit chunk
4968 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4969 IdxVal &= ~(ElemsPerChunk - 1);
4971 // If the input is a buildvector just emit a smaller one.
4972 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4973 return DAG.getBuildVector(ResultVT, dl,
4974 Vec->ops().slice(IdxVal, ElemsPerChunk));
4976 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4977 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4980 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4981 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4982 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4983 /// instructions or a simple subregister reference. Idx is an index in the
4984 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4985 /// lowering EXTRACT_VECTOR_ELT operations easier.
4986 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4987 SelectionDAG &DAG, const SDLoc &dl) {
4988 assert((Vec.getValueType().is256BitVector() ||
4989 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4990 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4993 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4994 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4995 SelectionDAG &DAG, const SDLoc &dl) {
4996 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4997 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
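// Common helper to insert the vectorWidth-bit (128/256-bit) subvector Vec into
// Result at element index IdxVal, rounded down to a whole chunk.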
5000 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5001 SelectionDAG &DAG, const SDLoc &dl,
5002 unsigned vectorWidth) {
5003 assert((vectorWidth == 128 || vectorWidth == 256) &&
5004 "Unsupported vector width");
5005 // Inserting an UNDEF subvector leaves Result unchanged.
5006 if (Vec.isUndef())
5007 return Result;
5008 EVT VT = Vec.getValueType();
5009 EVT ElVT = VT.getVectorElementType();
5010 EVT ResultVT = Result.getValueType();
5012 // Insert the relevant vectorWidth bits.
5013 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5014 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5016 // This is the index of the first element of the vectorWidth-bit chunk
5017 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5018 IdxVal &= ~(ElemsPerChunk - 1);
5020 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5021 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5024 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5025 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5026 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5027 /// simple superregister reference. Idx is an index in the 128 bits
5028 /// we want. It need not be aligned to a 128-bit boundary. That makes
5029 /// lowering INSERT_VECTOR_ELT operations easier.
5030 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5031 SelectionDAG &DAG, const SDLoc &dl) {
5032 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5033 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5036 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5037 SelectionDAG &DAG, const SDLoc &dl) {
5038 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5039 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5042 // Return true if the instruction zeroes the unused upper part of the
5043 // destination and accepts a mask.
5044 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5049 case X86ISD::TESTNM:
5050 case X86ISD::PCMPEQM:
5051 case X86ISD::PCMPGTM:
5054 case X86ISD::CMPM_RND:
5059 /// Insert i1-subvector to i1-vector.
5060 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5061 const X86Subtarget &Subtarget) {
5064 SDValue Vec = Op.getOperand(0);
5065 SDValue SubVec = Op.getOperand(1);
5066 SDValue Idx = Op.getOperand(2);
5068 if (!isa<ConstantSDNode>(Idx))
5071 // Inserting undef is a nop. We can just return the original vector.
5072 if (SubVec.isUndef())
5075 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5076 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5079 MVT OpVT = Op.getSimpleValueType();
5080 unsigned NumElems = OpVT.getVectorNumElements();
5082 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5084 // Extend to natively supported kshift.
5085 MVT WideOpVT = OpVT;
5086 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5087 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5089 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5090 // if needed.
5091 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5092 // May need to promote to a legal type.
5093 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5094 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5096 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5099 MVT SubVecVT = SubVec.getSimpleValueType();
5100 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5102 assert(IdxVal + SubVecNumElems <= NumElems &&
5103 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5104 "Unexpected index value in INSERT_SUBVECTOR");
5106 SDValue Undef = DAG.getUNDEF(WideOpVT);
5109 // Zero lower bits of the Vec
5110 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5111 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5113 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5114 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5115 // Merge them together; SubVec should be zero extended.
5116 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5117 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5119 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5120 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5123 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5124 Undef, SubVec, ZeroIdx);
5126 if (Vec.isUndef()) {
5127 assert(IdxVal != 0 && "Unexpected index");
5128 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5129 DAG.getConstant(IdxVal, dl, MVT::i8));
5130 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5133 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5134 assert(IdxVal != 0 && "Unexpected index");
5135 NumElems = WideOpVT.getVectorNumElements();
5136 unsigned ShiftLeft = NumElems - SubVecNumElems;
5137 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5138 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5139 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5140 if (ShiftRight != 0)
5141 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5142 DAG.getConstant(ShiftRight, dl, MVT::i8));
5143 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5146 // Simple case: the subvector goes into the upper part.
5147 if (IdxVal + SubVecNumElems == NumElems) {
5148 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5149 DAG.getConstant(IdxVal, dl, MVT::i8));
5150 if (SubVecNumElems * 2 == NumElems) {
5151 // Special case, use legal zero extending insert_subvector. This allows
5152 // isel to optimize when bits are known zero.
5153 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5154 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5155 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5158 // Otherwise use explicit shifts to zero the bits.
5159 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5160 Undef, Vec, ZeroIdx);
5161 NumElems = WideOpVT.getVectorNumElements();
5162 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5163 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5164 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5166 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5167 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5170 // Inserting into the middle is more complicated.
5172 NumElems = WideOpVT.getVectorNumElements();
5174 // Widen the vector if needed.
5175 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5176 // Move the current value of the bits to be replaced to the lsbs.
5177 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5178 DAG.getConstant(IdxVal, dl, MVT::i8));
5179 // Xor with the new bit.
5180 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5181 // Shift to MSB, filling bottom bits with 0.
5182 unsigned ShiftLeft = NumElems - SubVecNumElems;
5183 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5184 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5185 // Shift to the final position, filling upper bits with 0.
5186 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5187 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5188 DAG.getConstant(ShiftRight, dl, MVT::i8));
5189 // Xor with original vector leaving the new value.
5190 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5191 // Reduce to original width if needed.
5192 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5195 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
5196 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5197 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5198 /// large BUILD_VECTORS.
5199 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5200 unsigned NumElems, SelectionDAG &DAG,
5202 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5203 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
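// Likewise concat two 256-bit vectors into a 512-bit vector.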
5206 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5207 unsigned NumElems, SelectionDAG &DAG,
5209 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5210 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5213 /// Returns a vector of specified type with all bits set.
5214 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5215 /// Then bitcast to their original type, ensuring they get CSE'd.
5216 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5217 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5218 "Expected a 128/256/512-bit vector type");
5220 APInt Ones = APInt::getAllOnesValue(32);
5221 unsigned NumElts = VT.getSizeInBits() / 32;
5222 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5223 return DAG.getBitcast(VT, Vec);
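// Emit an X86ISD::VSEXT/VZEXT of In to VT. For wide results only the low
// half/quarter of the input that is actually consumed is extracted first.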
5226 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5227 SelectionDAG &DAG) {
5228 EVT InVT = In.getValueType();
5229 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5231 if (VT.is128BitVector() && InVT.is128BitVector())
5232 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5233 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5235 // For 256-bit vectors, we only need the lower (128-bit) input half.
5236 // For 512-bit vectors, we only need the lower input half or quarter.
5237 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5238 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5239 In = extractSubVector(In, 0, DAG, DL,
5240 std::max(128, (int)VT.getSizeInBits() / Scale));
5243 return DAG.getNode(Opc, DL, VT, In);
5246 /// Returns a vector_shuffle node for an unpackl operation.
5247 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5248 SDValue V1, SDValue V2) {
5249 SmallVector<int, 8> Mask;
5250 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5251 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5254 /// Returns a vector_shuffle node for an unpackh operation.
5255 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5256 SDValue V1, SDValue V2) {
5257 SmallVector<int, 8> Mask;
5258 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5259 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5262 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5263 /// This produces a shuffle where the low element of V2 is swizzled into the
5264 /// zero/undef vector, landing at element Idx.
5265 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5266 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5268 const X86Subtarget &Subtarget,
5269 SelectionDAG &DAG) {
5270 MVT VT = V2.getSimpleValueType();
5272 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5273 int NumElems = VT.getVectorNumElements();
5274 SmallVector<int, 16> MaskVec(NumElems);
5275 for (int i = 0; i != NumElems; ++i)
5276 // If this is the insertion idx, put the low elt of V2 here.
5277 MaskVec[i] = (i == Idx) ? NumElems : i;
5278 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
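// Peek through any chain of bitcasts to find the underlying source value.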
5281 static SDValue peekThroughBitcasts(SDValue V) {
5282 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5283 V = V.getOperand(0);
5287 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5288 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5289 V.getOperand(0).hasOneUse())
5290 V = V.getOperand(0);
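// If Op is (a bitcast of) a load from the constant pool, return the Constant
// being loaded; otherwise return null.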
5294 static const Constant *getTargetConstantFromNode(SDValue Op) {
5295 Op = peekThroughBitcasts(Op);
5297 auto *Load = dyn_cast<LoadSDNode>(Op);
5301 SDValue Ptr = Load->getBasePtr();
5302 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5303 Ptr->getOpcode() == X86ISD::WrapperRIP)
5304 Ptr = Ptr->getOperand(0);
5306 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5307 if (!CNode || CNode->isMachineConstantPoolEntry())
5310 return dyn_cast<Constant>(CNode->getConstVal());
5313 // Extract raw constant bits from constant pools.
5314 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5316 SmallVectorImpl<APInt> &EltBits,
5317 bool AllowWholeUndefs = true,
5318 bool AllowPartialUndefs = true) {
5319 assert(EltBits.empty() && "Expected an empty EltBits vector");
5321 Op = peekThroughBitcasts(Op);
5323 EVT VT = Op.getValueType();
5324 unsigned SizeInBits = VT.getSizeInBits();
5325 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5326 unsigned NumElts = SizeInBits / EltSizeInBits;
5328 // Bitcast a source array of element bits to the target size.
5329 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5330 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5331 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5332 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5333 "Constant bit sizes don't match");
5335 // Don't split if we don't allow undef bits.
5336 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5337 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5340 // If we're already the right size, don't bother bitcasting.
5341 if (NumSrcElts == NumElts) {
5342 UndefElts = UndefSrcElts;
5343 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5347 // Extract all the undef/constant element data and pack into single bitsets.
5348 APInt UndefBits(SizeInBits, 0);
5349 APInt MaskBits(SizeInBits, 0);
5351 for (unsigned i = 0; i != NumSrcElts; ++i) {
5352 unsigned BitOffset = i * SrcEltSizeInBits;
5353 if (UndefSrcElts[i])
5354 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5355 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5358 // Split the undef/constant single bitset data into the target elements.
5359 UndefElts = APInt(NumElts, 0);
5360 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5362 for (unsigned i = 0; i != NumElts; ++i) {
5363 unsigned BitOffset = i * EltSizeInBits;
5364 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5366 // Only treat an element as UNDEF if all bits are UNDEF.
5367 if (UndefEltBits.isAllOnesValue()) {
5368 if (!AllowWholeUndefs)
5370 UndefElts.setBit(i);
5374 // If only some bits are UNDEF then treat them as zero (or bail if not
5375 // supported).
5376 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5379 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5380 EltBits[i] = Bits.getZExtValue();
5385 // Collect constant bits and insert into mask/undef bit masks.
5386 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5387 unsigned UndefBitIndex) {
5390 if (isa<UndefValue>(Cst)) {
5391 Undefs.setBit(UndefBitIndex);
5394 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5395 Mask = CInt->getValue();
5398 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5399 Mask = CFP->getValueAPF().bitcastToAPInt();
5407 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5408 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5409 return CastBitData(UndefSrcElts, SrcEltBits);
5412 // Extract scalar constant bits.
5413 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5414 APInt UndefSrcElts = APInt::getNullValue(1);
5415 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5416 return CastBitData(UndefSrcElts, SrcEltBits);
5419 // Extract constant bits from build vector.
5420 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5421 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5422 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5424 APInt UndefSrcElts(NumSrcElts, 0);
5425 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5426 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5427 const SDValue &Src = Op.getOperand(i);
5428 if (Src.isUndef()) {
5429 UndefSrcElts.setBit(i);
5432 auto *Cst = cast<ConstantSDNode>(Src);
5433 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5435 return CastBitData(UndefSrcElts, SrcEltBits);
5438 // Extract constant bits from constant pool vector.
5439 if (auto *Cst = getTargetConstantFromNode(Op)) {
5440 Type *CstTy = Cst->getType();
5441 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5444 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5445 unsigned NumSrcElts = CstTy->getVectorNumElements();
5447 APInt UndefSrcElts(NumSrcElts, 0);
5448 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5449 for (unsigned i = 0; i != NumSrcElts; ++i)
5450 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5454 return CastBitData(UndefSrcElts, SrcEltBits);
5457 // Extract constant bits from a broadcasted constant pool scalar.
5458 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5459 EltSizeInBits <= VT.getScalarSizeInBits()) {
5460 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5461 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5462 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5464 APInt UndefSrcElts(NumSrcElts, 0);
5465 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5466 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5467 if (UndefSrcElts[0])
5468 UndefSrcElts.setBits(0, NumSrcElts);
5469 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5470 return CastBitData(UndefSrcElts, SrcEltBits);
5475 // Extract a rematerialized scalar constant insertion.
5476 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5477 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5478 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5479 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5480 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5482 APInt UndefSrcElts(NumSrcElts, 0);
5483 SmallVector<APInt, 64> SrcEltBits;
5484 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5485 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5486 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5487 return CastBitData(UndefSrcElts, SrcEltBits);
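// Decode a constant shuffle-mask operand into raw per-element indices of
// MaskEltSizeInBits each; fails if any mask bits are undef.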
5493 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5494 unsigned MaskEltSizeInBits,
5495 SmallVectorImpl<uint64_t> &RawMask) {
5497 SmallVector<APInt, 64> EltBits;
5499 // Extract the raw target constant bits.
5500 // FIXME: We currently don't support UNDEF bits or mask entries.
5501 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5502 EltBits, /* AllowWholeUndefs */ false,
5503 /* AllowPartialUndefs */ false))
5506 // Insert the extracted elements into the mask.
5507 for (APInt Elt : EltBits)
5508 RawMask.push_back(Elt.getZExtValue());
5513 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5514 /// Note: This ignores saturation, so inputs must be checked first.
5515 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5517 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5518 unsigned NumElts = VT.getVectorNumElements();
5519 unsigned NumLanes = VT.getSizeInBits() / 128;
5520 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5521 unsigned Offset = Unary ? 0 : NumElts;
5523 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5524 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5525 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5526 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5527 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5531 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5532 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5533 /// operands in \p Ops, and returns true.
5534 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5535 /// IsUnary for shuffles which use a single input multiple times, and in those
5536 /// cases it will adjust the mask to only have indices within that single input.
5537 /// It is an error to call this with non-empty Mask/Ops vectors.
5538 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5539 SmallVectorImpl<SDValue> &Ops,
5540 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5541 unsigned NumElems = VT.getVectorNumElements();
5544 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5545 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5548 bool IsFakeUnary = false;
5549 switch(N->getOpcode()) {
5550 case X86ISD::BLENDI:
5551 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5552 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5553 ImmN = N->getOperand(N->getNumOperands()-1);
5554 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5555 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5558 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5559 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5560 ImmN = N->getOperand(N->getNumOperands()-1);
5561 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5562 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5564 case X86ISD::INSERTPS:
5565 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5566 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5567 ImmN = N->getOperand(N->getNumOperands()-1);
5568 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5569 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5571 case X86ISD::EXTRQI:
5572 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5573 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5574 isa<ConstantSDNode>(N->getOperand(2))) {
5575 int BitLen = N->getConstantOperandVal(1);
5576 int BitIdx = N->getConstantOperandVal(2);
5577 DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5581 case X86ISD::INSERTQI:
5582 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5583 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5584 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5585 isa<ConstantSDNode>(N->getOperand(3))) {
5586 int BitLen = N->getConstantOperandVal(2);
5587 int BitIdx = N->getConstantOperandVal(3);
5588 DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5589 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5592 case X86ISD::UNPCKH:
5593 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5594 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5595 DecodeUNPCKHMask(VT, Mask);
5596 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5598 case X86ISD::UNPCKL:
5599 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5600 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5601 DecodeUNPCKLMask(VT, Mask);
5602 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5604 case X86ISD::MOVHLPS:
5605 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5606 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5607 DecodeMOVHLPSMask(NumElems, Mask);
5608 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5610 case X86ISD::MOVLHPS:
5611 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeMOVLHPSMask(NumElems, Mask);
5614 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5616 case X86ISD::PALIGNR:
5617 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5618 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5619 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5620 ImmN = N->getOperand(N->getNumOperands()-1);
5621 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5622 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
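// The decoded PALIGNR mask indexes the second operand first, so push the
// operands in reversed order.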
5623 Ops.push_back(N->getOperand(1));
5624 Ops.push_back(N->getOperand(0));
5626 case X86ISD::VSHLDQ:
5627 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5628 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5629 ImmN = N->getOperand(N->getNumOperands() - 1);
5630 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5633 case X86ISD::VSRLDQ:
5634 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5635 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5636 ImmN = N->getOperand(N->getNumOperands() - 1);
5637 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5640 case X86ISD::PSHUFD:
5641 case X86ISD::VPERMILPI:
5642 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5643 ImmN = N->getOperand(N->getNumOperands()-1);
5644 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5647 case X86ISD::PSHUFHW:
5648 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5649 ImmN = N->getOperand(N->getNumOperands()-1);
5650 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5653 case X86ISD::PSHUFLW:
5654 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5655 ImmN = N->getOperand(N->getNumOperands()-1);
5656 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5659 case X86ISD::VZEXT_MOVL:
5660 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5661 DecodeZeroMoveLowMask(VT, Mask);
5664 case X86ISD::VBROADCAST: {
5665 SDValue N0 = N->getOperand(0);
5666 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5667 // add the pre-extracted value to the Ops vector.
5668 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5669 N0.getOperand(0).getValueType() == VT &&
5670 N0.getConstantOperandVal(1) == 0)
5671 Ops.push_back(N0.getOperand(0));
5673 // We only decode broadcasts of same-sized vectors, unless the broadcast
5674 // came from an extract from the original width. If we found one, we
5675 // pushed it onto the Ops vector above.
5676 if (N0.getValueType() == VT || !Ops.empty()) {
5677 DecodeVectorBroadcast(VT, Mask);
5683 case X86ISD::VPERMILPV: {
5684 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5686 SDValue MaskNode = N->getOperand(1);
5687 unsigned MaskEltSize = VT.getScalarSizeInBits();
5688 SmallVector<uint64_t, 32> RawMask;
5689 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5690 DecodeVPERMILPMask(VT, RawMask, Mask);
5693 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5694 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5699 case X86ISD::PSHUFB: {
5700 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5701 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5702 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5704 SDValue MaskNode = N->getOperand(1);
5705 SmallVector<uint64_t, 32> RawMask;
5706 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5707 DecodePSHUFBMask(RawMask, Mask);
5710 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5711 DecodePSHUFBMask(C, Mask);
5716 case X86ISD::VPERMI:
5717 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5718 ImmN = N->getOperand(N->getNumOperands()-1);
5719 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5724 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5725 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5726 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5728 case X86ISD::VPERM2X128:
5729 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5730 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5731 ImmN = N->getOperand(N->getNumOperands()-1);
5732 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5733 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5735 case X86ISD::MOVSLDUP:
5736 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5737 DecodeMOVSLDUPMask(VT, Mask);
5740 case X86ISD::MOVSHDUP:
5741 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5742 DecodeMOVSHDUPMask(VT, Mask);
5745 case X86ISD::MOVDDUP:
5746 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5747 DecodeMOVDDUPMask(VT, Mask);
5750 case X86ISD::MOVLPD:
5751 case X86ISD::MOVLPS:
5752 // Not yet implemented
5754 case X86ISD::VPERMIL2: {
5755 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5756 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5757 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5758 unsigned MaskEltSize = VT.getScalarSizeInBits();
5759 SDValue MaskNode = N->getOperand(2);
5760 SDValue CtrlNode = N->getOperand(3);
5761 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5762 unsigned CtrlImm = CtrlOp->getZExtValue();
5763 SmallVector<uint64_t, 32> RawMask;
5764 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5765 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5768 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5769 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5775 case X86ISD::VPPERM: {
5776 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5777 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5778 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5779 SDValue MaskNode = N->getOperand(2);
5780 SmallVector<uint64_t, 32> RawMask;
5781 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5782 DecodeVPPERMMask(RawMask, Mask);
5785 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5786 DecodeVPPERMMask(C, Mask);
5791 case X86ISD::VPERMV: {
5792 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5794 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5795 Ops.push_back(N->getOperand(1));
5796 SDValue MaskNode = N->getOperand(0);
5797 SmallVector<uint64_t, 32> RawMask;
5798 unsigned MaskEltSize = VT.getScalarSizeInBits();
5799 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5800 DecodeVPERMVMask(RawMask, Mask);
5803 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5804 DecodeVPERMVMask(C, MaskEltSize, Mask);
5809 case X86ISD::VPERMV3: {
5810 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5811 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5812 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5813 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5814 Ops.push_back(N->getOperand(0));
5815 Ops.push_back(N->getOperand(2));
5816 SDValue MaskNode = N->getOperand(1);
5817 unsigned MaskEltSize = VT.getScalarSizeInBits();
5818 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5819 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5824 case X86ISD::VPERMIV3: {
5825 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5826 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5827 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5828 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5829 Ops.push_back(N->getOperand(1));
5830 Ops.push_back(N->getOperand(2));
5831 SDValue MaskNode = N->getOperand(0);
5832 unsigned MaskEltSize = VT.getScalarSizeInBits();
5833 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5834 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5839 default: llvm_unreachable("unknown target shuffle node");
5842 // Empty mask indicates the decode failed.
5846 // Check if we're getting a shuffle mask with zero'd elements.
5847 if (!AllowSentinelZero)
5848 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5851 // If we have a fake unary shuffle, the shuffle mask is spread across two
5852 // inputs that are actually the same node. Re-map the mask to always point
5853 // into the first input.
5856 if (M >= (int)Mask.size())
5859 // If we didn't already add operands in the opcode-specific code, default to
5860 // adding 1 or 2 operands starting at 0.
5862 Ops.push_back(N->getOperand(0));
5863 if (!IsUnary || IsFakeUnary)
5864 Ops.push_back(N->getOperand(1));
5870 /// Check a target shuffle mask's inputs to see if we can set any values to
5871 /// SM_SentinelZero - this is for elements that are known to be zero
5872 /// (not just zeroable) from their inputs.
5873 /// Returns true if the target shuffle mask was decoded.
5874 static bool setTargetShuffleZeroElements(SDValue N,
5875 SmallVectorImpl<int> &Mask,
5876 SmallVectorImpl<SDValue> &Ops) {
5878 if (!isTargetShuffle(N.getOpcode()))
5881 MVT VT = N.getSimpleValueType();
5882 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5885 SDValue V1 = Ops[0];
5886 SDValue V2 = IsUnary ? V1 : Ops[1];
5888 V1 = peekThroughBitcasts(V1);
5889 V2 = peekThroughBitcasts(V2);
5891 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5892 "Illegal split of shuffle value type");
5893 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5895 // Extract known constant input data.
5896 APInt UndefSrcElts[2];
5897 SmallVector<APInt, 32> SrcEltBits[2];
5898 bool IsSrcConstant[2] = {
5899 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5900 SrcEltBits[0], true, false),
5901 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5902 SrcEltBits[1], true, false)};
5904 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5907 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5911 // Determine shuffle input and normalize the mask.
5912 unsigned SrcIdx = M / Size;
5913 SDValue V = M < Size ? V1 : V2;
5916 // We are referencing an UNDEF input.
5918 Mask[i] = SM_SentinelUndef;
5922 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5923 // TODO: We currently only set UNDEF for integer types - floats use the same
5924 // registers as vectors and many of the scalar folded loads rely on the
5925 // SCALAR_TO_VECTOR pattern.
5926 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5927 (Size % V.getValueType().getVectorNumElements()) == 0) {
5928 int Scale = Size / V.getValueType().getVectorNumElements();
5929 int Idx = M / Scale;
5930 if (Idx != 0 && !VT.isFloatingPoint())
5931 Mask[i] = SM_SentinelUndef;
5932 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5933 Mask[i] = SM_SentinelZero;
5937 // Attempt to extract from the source's constant bits.
5938 if (IsSrcConstant[SrcIdx]) {
5939 if (UndefSrcElts[SrcIdx][M])
5940 Mask[i] = SM_SentinelUndef;
5941 else if (SrcEltBits[SrcIdx][M] == 0)
5942 Mask[i] = SM_SentinelZero;
5946 assert(VT.getVectorNumElements() == Mask.size() &&
5947 "Different mask size from vector size!");
5951 // Attempt to decode ops that could be represented as a shuffle mask.
5952 // The decoded shuffle mask may contain a different number of elements than the
5953 // destination value type.
5954 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5955 SmallVectorImpl<SDValue> &Ops,
5956 SelectionDAG &DAG) {
5960 MVT VT = N.getSimpleValueType();
5961 unsigned NumElts = VT.getVectorNumElements();
5962 unsigned NumSizeInBits = VT.getSizeInBits();
5963 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5964 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5965 "Expected byte aligned value types");
5967 unsigned Opcode = N.getOpcode();
5970 case X86ISD::ANDNP: {
5971 // Attempt to decode as a per-byte mask.
5973 SmallVector<APInt, 32> EltBits;
5974 SDValue N0 = N.getOperand(0);
5975 SDValue N1 = N.getOperand(1);
5976 bool IsAndN = (X86ISD::ANDNP == Opcode);
5977 uint64_t ZeroMask = IsAndN ? 255 : 0;
5978 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5980 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5982 Mask.push_back(SM_SentinelUndef);
5985 uint64_t ByteBits = EltBits[i].getZExtValue();
5986 if (ByteBits != 0 && ByteBits != 255)
5988 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5990 Ops.push_back(IsAndN ? N1 : N0);
5993 case ISD::SCALAR_TO_VECTOR: {
5994 // Match against a scalar_to_vector of an extract from a vector;
5995 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
5996 SDValue N0 = N.getOperand(0);
5999 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6000 N0.getOperand(0).getValueType() == VT) ||
6001 (N0.getOpcode() == X86ISD::PEXTRW &&
6002 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6003 (N0.getOpcode() == X86ISD::PEXTRB &&
6004 N0.getOperand(0).getValueType() == MVT::v16i8)) {
6008 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6011 SDValue SrcVec = SrcExtract.getOperand(0);
6012 EVT SrcVT = SrcVec.getValueType();
6013 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6014 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6016 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6017 if (NumSrcElts <= SrcIdx)
6020 Ops.push_back(SrcVec);
6021 Mask.push_back(SrcIdx);
6022 Mask.append(NumZeros, SM_SentinelZero);
6023 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6026 case X86ISD::PINSRB:
6027 case X86ISD::PINSRW: {
6028 SDValue InVec = N.getOperand(0);
6029 SDValue InScl = N.getOperand(1);
6030 uint64_t InIdx = N.getConstantOperandVal(2);
6031 assert(InIdx < NumElts && "Illegal insertion index");
6033 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6034 if (X86::isZeroNode(InScl)) {
6035 Ops.push_back(InVec);
6036 for (unsigned i = 0; i != NumElts; ++i)
6037 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6041 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6042 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
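// For example, (X86ISD::PINSRW X, (X86ISD::PEXTRW Y, 3), 1) is decoded as a
// two-input shuffle of X and Y with mask <0,11,2,3,4,5,6,7>.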
6044 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6045 if (InScl.getOpcode() != ExOp)
6048 SDValue ExVec = InScl.getOperand(0);
6049 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6050 assert(ExIdx < NumElts && "Illegal extraction index");
6051 Ops.push_back(InVec);
6052 Ops.push_back(ExVec);
6053 for (unsigned i = 0; i != NumElts; ++i)
6054 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6057 case X86ISD::PACKSS:
6058 case X86ISD::PACKUS: {
6059 SDValue N0 = N.getOperand(0);
6060 SDValue N1 = N.getOperand(1);
6061 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6062 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6063 "Unexpected input value type");
6065 // If we know input saturation won't happen we can treat this
6066 // as a truncation shuffle.
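// For example, a v8i16 X86ISD::PACKUS of two non-saturating v4i32 inputs is
// effectively the truncating shuffle <0,2,4,6,8,10,12,14> of the two inputs
// viewed as v8i16.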
6067 if (Opcode == X86ISD::PACKSS) {
6068 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6069 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6072 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6073 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6074 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6078 bool IsUnary = (N0 == N1);
6084 createPackShuffleMask(VT, Mask, IsUnary);
6088 case X86ISD::VSRLI: {
6089 uint64_t ShiftVal = N.getConstantOperandVal(1);
6090 // Out of range bit shifts are guaranteed to be zero.
6091 if (NumBitsPerElt <= ShiftVal) {
6092 Mask.append(NumElts, SM_SentinelZero);
6096 // We can only decode 'whole byte' bit shifts as shuffles.
6097 if ((ShiftVal % 8) != 0)
6100 uint64_t ByteShift = ShiftVal / 8;
6101 unsigned NumBytes = NumSizeInBits / 8;
6102 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6103 Ops.push_back(N.getOperand(0));
6105 // Clear mask to all zeros and insert the shifted byte indices.
6106 Mask.append(NumBytes, SM_SentinelZero);
6108 if (X86ISD::VSHLI == Opcode) {
6109 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6110 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6111 Mask[i + j] = i + j - ByteShift;
6113 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6114 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6115 Mask[i + j - ByteShift] = i + j;
6119 case ISD::ZERO_EXTEND_VECTOR_INREG:
6120 case X86ISD::VZEXT: {
6121 // TODO - add support for VPMOVZX with smaller input vector types.
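// For example, a zero extension from v8i16 to v4i32 is decoded as the mask
// <0,Z,1,Z,2,Z,3,Z> in terms of the v8i16 source elements
// (Z = SM_SentinelZero).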
6122 SDValue Src = N.getOperand(0);
6123 MVT SrcVT = Src.getSimpleValueType();
6124 if (NumSizeInBits != SrcVT.getSizeInBits())
6126 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6135 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6136 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6137 SmallVectorImpl<int> &Mask) {
6138 int MaskWidth = Mask.size();
6139 SmallVector<SDValue, 16> UsedInputs;
6140 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6141 int lo = UsedInputs.size() * MaskWidth;
6142 int hi = lo + MaskWidth;
6144 // Strip UNDEF input usage.
6145 if (Inputs[i].isUndef())
6147 if ((lo <= M) && (M < hi))
6148 M = SM_SentinelUndef;
6150 // Check for unused inputs.
6151 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6152 UsedInputs.push_back(Inputs[i]);
6159 Inputs = UsedInputs;
6162 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6163 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6164 /// remaining input indices in case we now have a unary shuffle and adjust the
6165 /// inputs accordingly.
6166 /// Returns true if the target shuffle mask was decoded.
6167 static bool resolveTargetShuffleInputs(SDValue Op,
6168 SmallVectorImpl<SDValue> &Inputs,
6169 SmallVectorImpl<int> &Mask,
6170 SelectionDAG &DAG) {
6171 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6172 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6175 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6179 /// Returns the scalar element that will make up the ith
6180 /// element of the result of the vector shuffle.
6181 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6184 return SDValue(); // Limit search depth.
6186 SDValue V = SDValue(N, 0);
6187 EVT VT = V.getValueType();
6188 unsigned Opcode = V.getOpcode();
6190 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6191 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6192 int Elt = SV->getMaskElt(Index);
6195 return DAG.getUNDEF(VT.getVectorElementType());
6197 unsigned NumElems = VT.getVectorNumElements();
6198 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6199 : SV->getOperand(1);
6200 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6203 // Recurse into target specific vector shuffles to find scalars.
6204 if (isTargetShuffle(Opcode)) {
6205 MVT ShufVT = V.getSimpleValueType();
6206 MVT ShufSVT = ShufVT.getVectorElementType();
6207 int NumElems = (int)ShufVT.getVectorNumElements();
6208 SmallVector<int, 16> ShuffleMask;
6209 SmallVector<SDValue, 16> ShuffleOps;
6212 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6215 int Elt = ShuffleMask[Index];
6216 if (Elt == SM_SentinelZero)
6217 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6218 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6219 if (Elt == SM_SentinelUndef)
6220 return DAG.getUNDEF(ShufSVT);
6222 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6223 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6224 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6228 // Actual nodes that may contain scalar elements
6229 if (Opcode == ISD::BITCAST) {
6230 V = V.getOperand(0);
6231 EVT SrcVT = V.getValueType();
6232 unsigned NumElems = VT.getVectorNumElements();
6234 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6238 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6239 return (Index == 0) ? V.getOperand(0)
6240 : DAG.getUNDEF(VT.getVectorElementType());
6242 if (V.getOpcode() == ISD::BUILD_VECTOR)
6243 return V.getOperand(Index);
6248 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6249 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6250 unsigned NumNonZero, unsigned NumZero,
6252 const X86Subtarget &Subtarget) {
6253 MVT VT = Op.getSimpleValueType();
6254 unsigned NumElts = VT.getVectorNumElements();
6255 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6256 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6257 "Illegal vector insertion");
6263 for (unsigned i = 0; i < NumElts; ++i) {
6264 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6268 // If the build vector contains zeros or our first insertion is not the
6269 // first index, then insert into a zero vector to break any register
6270 // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6273 if (NumZero || 0 != i)
6274 V = getZeroVector(VT, Subtarget, DAG, dl);
6276 assert(0 == i && "Expected insertion into zero-index");
6277 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6278 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6279 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6280 V = DAG.getBitcast(VT, V);
6284 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6285 DAG.getIntPtrConstant(i, dl));
6291 /// Custom lower build_vector of v16i8.
6292 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6293 unsigned NumNonZero, unsigned NumZero,
6295 const X86Subtarget &Subtarget) {
6296 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6299 // SSE4.1 - use PINSRB to insert each byte directly.
6300 if (Subtarget.hasSSE41())
6301 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6308 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6309 for (unsigned i = 0; i < 16; ++i) {
6310 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6311 if (ThisIsNonZero && First) {
6313 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6315 V = DAG.getUNDEF(MVT::v8i16);
6320 // FIXME: Investigate extending to i32 instead of just i16.
6321 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6322 SDValue ThisElt, LastElt;
6323 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6324 if (LastIsNonZero) {
6326 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6328 if (ThisIsNonZero) {
6329 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6330 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6331 DAG.getConstant(8, dl, MVT::i8));
6333 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6339 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6340 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6341 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6342 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6343 V = DAG.getBitcast(MVT::v8i16, V);
6345 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6346 DAG.getIntPtrConstant(i / 2, dl));
6352 return DAG.getBitcast(MVT::v16i8, V);
6355 /// Custom lower build_vector of v8i16.
6356 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6357 unsigned NumNonZero, unsigned NumZero,
6359 const X86Subtarget &Subtarget) {
6360 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6363 // Use PINSRW to insert each element directly.
6364 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6368 /// Custom lower build_vector of v4i32 or v4f32.
6369 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6370 const X86Subtarget &Subtarget) {
6371 // Find all zeroable elements.
6372 std::bitset<4> Zeroable;
6373 for (int i=0; i < 4; ++i) {
6374 SDValue Elt = Op->getOperand(i);
6375 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6377 assert(Zeroable.size() - Zeroable.count() > 1 &&
6378 "We expect at least two non-zero elements!");
6380 // We only know how to deal with build_vector nodes where elements are either
6381 // zeroable or extract_vector_elt with constant index.
6382 SDValue FirstNonZero;
6383 unsigned FirstNonZeroIdx;
6384 for (unsigned i=0; i < 4; ++i) {
6387 SDValue Elt = Op->getOperand(i);
6388 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6389 !isa<ConstantSDNode>(Elt.getOperand(1)))
6391 // Make sure that this node is extracting from a 128-bit vector.
6392 MVT VT = Elt.getOperand(0).getSimpleValueType();
6393 if (!VT.is128BitVector())
6395 if (!FirstNonZero.getNode()) {
6397 FirstNonZeroIdx = i;
6401 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6402 SDValue V1 = FirstNonZero.getOperand(0);
6403 MVT VT = V1.getSimpleValueType();
6405 // See if this build_vector can be lowered as a blend with zero.
6407 unsigned EltMaskIdx, EltIdx;
6409 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6410 if (Zeroable[EltIdx]) {
6411 // The zero vector will be on the right hand side.
6412 Mask[EltIdx] = EltIdx+4;
6416 Elt = Op->getOperand(EltIdx);
6417 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6418 EltMaskIdx = Elt.getConstantOperandVal(1);
6419 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6421 Mask[EltIdx] = EltIdx;
6425 // Let the shuffle legalizer deal with blend operations.
6426 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6427 if (V1.getSimpleValueType() != VT)
6428 V1 = DAG.getBitcast(VT, V1);
6429 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6432 // See if we can lower this build_vector to a INSERTPS.
6433 if (!Subtarget.hasSSE41())
6436 SDValue V2 = Elt.getOperand(0);
6437 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6440 bool CanFold = true;
6441 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6445 SDValue Current = Op->getOperand(i);
6446 SDValue SrcVector = Current->getOperand(0);
6449 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6455 assert(V1.getNode() && "Expected at least two non-zero elements!");
6456 if (V1.getSimpleValueType() != MVT::v4f32)
6457 V1 = DAG.getBitcast(MVT::v4f32, V1);
6458 if (V2.getSimpleValueType() != MVT::v4f32)
6459 V2 = DAG.getBitcast(MVT::v4f32, V2);
6461 // Ok, we can emit an INSERTPS instruction.
6462 unsigned ZMask = Zeroable.to_ulong();
6464 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6465 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6467 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6468 DAG.getIntPtrConstant(InsertPSMask, DL));
6469 return DAG.getBitcast(VT, Result);
6472 /// Return a vector logical shift node.
6473 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6474 SelectionDAG &DAG, const TargetLowering &TLI,
6476 assert(VT.is128BitVector() && "Unknown type for VShift");
6477 MVT ShVT = MVT::v16i8;
6478 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6479 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6480 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6481 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6482 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6483 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6486 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6487 SelectionDAG &DAG) {
6489 // Check if the scalar load can be widened into a vector load. And if
6490 // the address is "base + cst" see if the cst can be "absorbed" into
6491 // the shuffle mask.
6492 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6493 SDValue Ptr = LD->getBasePtr();
6494 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6496 EVT PVT = LD->getValueType(0);
6497 if (PVT != MVT::i32 && PVT != MVT::f32)
6502 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6503 FI = FINode->getIndex();
6505 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6506 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6507 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6508 Offset = Ptr.getConstantOperandVal(1);
6509 Ptr = Ptr.getOperand(0);
6514 // FIXME: 256-bit vector instructions don't require a strict alignment,
6515 // improve this code to support it better.
6516 unsigned RequiredAlign = VT.getSizeInBits()/8;
6517 SDValue Chain = LD->getChain();
6518 // Make sure the stack object alignment is at least 16 or 32.
6519 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6520 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6521 if (MFI.isFixedObjectIndex(FI)) {
6522 // Can't change the alignment. FIXME: It's possible to compute
6523 // the exact stack offset and reference FI + adjust offset instead.
6524 // If someone *really* cares about this, that's the way to implement it.
6527 MFI.setObjectAlignment(FI, RequiredAlign);
6531 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6532 // Ptr + (Offset & ~15).
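// For example, a 4-byte load at base+20 can be widened to a 16-byte vector
// load at base+16, with the original value selected by shuffle element
// (20 - 16) / 4 == 1.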
6535 if ((Offset % RequiredAlign) & 3)
6537 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6540 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6541 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6544 int EltNo = (Offset - StartOffset) >> 2;
6545 unsigned NumElems = VT.getVectorNumElements();
6547 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6548 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6549 LD->getPointerInfo().getWithOffset(StartOffset));
6551 SmallVector<int, 8> Mask(NumElems, EltNo);
6553 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6559 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6560 /// elements can be replaced by a single large load which has the same value as
6561 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6563 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6564 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6565 const SDLoc &DL, SelectionDAG &DAG,
6566 const X86Subtarget &Subtarget,
6567 bool isAfterLegalize) {
6568 unsigned NumElems = Elts.size();
6570 int LastLoadedElt = -1;
6571 SmallBitVector LoadMask(NumElems, false);
6572 SmallBitVector ZeroMask(NumElems, false);
6573 SmallBitVector UndefMask(NumElems, false);
6575 // For each element in the initializer, see if we've found a load, zero or an
6577 for (unsigned i = 0; i < NumElems; ++i) {
6578 SDValue Elt = peekThroughBitcasts(Elts[i]);
6583 UndefMask[i] = true;
6584 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6586 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6589 // Each loaded element must be the correct fractional portion of the
6590 // requested vector load.
6591 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6596 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6597 "Incomplete element masks");
6599 // Handle Special Cases - all undef or undef/zero.
6600 if (UndefMask.count() == NumElems)
6601 return DAG.getUNDEF(VT);
6603 // FIXME: Should we return this as a BUILD_VECTOR instead?
6604 if ((ZeroMask | UndefMask).count() == NumElems)
6605 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6606 : DAG.getConstantFP(0.0, DL, VT);
6608 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6609 int FirstLoadedElt = LoadMask.find_first();
6610 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6611 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6612 EVT LDBaseVT = EltBase.getValueType();
6614 // Consecutive loads can contain UNDEFs but not ZERO elements.
6615 // Consecutive loads with UNDEF and ZERO elements require an
6616 // additional shuffle stage to clear the ZERO elements.
6617 bool IsConsecutiveLoad = true;
6618 bool IsConsecutiveLoadWithZeros = true;
6619 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6621 SDValue Elt = peekThroughBitcasts(Elts[i]);
6622 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6623 if (!DAG.areNonVolatileConsecutiveLoads(
6624 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6625 i - FirstLoadedElt)) {
6626 IsConsecutiveLoad = false;
6627 IsConsecutiveLoadWithZeros = false;
6630 } else if (ZeroMask[i]) {
6631 IsConsecutiveLoad = false;
6635 SmallVector<LoadSDNode *, 8> Loads;
6636 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6638 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6640 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6641 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6642 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6643 "Cannot merge volatile loads.");
6645 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6646 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6647 for (auto *LD : Loads)
6648 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6652 // LOAD - all consecutive load/undefs (must start/end with a load).
6653 // If we have found an entire vector of loads and undefs, then return a large
6654 // load of the entire vector width starting at the base pointer.
6655 // If the vector contains zeros, then attempt to shuffle those elements.
6656 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6657 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6658 assert(LDBase && "Did not find base load for merging consecutive loads");
6659 EVT EltVT = LDBase->getValueType(0);
6660 // Ensure that the input vector size for the merged loads matches the
6661 // cumulative size of the input elements.
6662 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6665 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6668 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6669 // will lower to regular temporal loads and use the cache.
6670 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6671 VT.is256BitVector() && !Subtarget.hasInt256())
6674 if (IsConsecutiveLoad)
6675 return CreateLoad(VT, LDBase);
6677 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6678 // vector and a zero vector to clear out the zero elements.
6679 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6680 SmallVector<int, 4> ClearMask(NumElems, -1);
6681 for (unsigned i = 0; i < NumElems; ++i) {
6683 ClearMask[i] = i + NumElems;
6684 else if (LoadMask[i])
6687 SDValue V = CreateLoad(VT, LDBase);
6688 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6689 : DAG.getConstantFP(0.0, DL, VT);
6690 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6695 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6697 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6698 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6699 (LoadSize == 32 || LoadSize == 64) &&
6700 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6701 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6702 : MVT::getIntegerVT(LoadSize);
6703 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6704 if (TLI.isTypeLegal(VecVT)) {
6705 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6706 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6708 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6709 LDBase->getPointerInfo(),
6710 LDBase->getAlignment(),
6711 MachineMemOperand::MOLoad);
6712 for (auto *LD : Loads)
6713 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6714 return DAG.getBitcast(VT, ResNode);
6721 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6722 unsigned SplatBitSize, LLVMContext &C) {
6723 unsigned ScalarSize = VT.getScalarSizeInBits();
6724 unsigned NumElm = SplatBitSize / ScalarSize;
6726 SmallVector<Constant *, 32> ConstantVec;
6727 for (unsigned i = 0; i < NumElm; i++) {
6728 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6730 if (VT.isFloatingPoint()) {
6731 if (ScalarSize == 32) {
6732 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6734 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6735 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6738 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6739 ConstantVec.push_back(Const);
6741 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6744 static bool isUseOfShuffle(SDNode *N) {
6745 for (auto *U : N->uses()) {
6746 if (isTargetShuffle(U->getOpcode()))
6748 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6749 return isUseOfShuffle(U);
6754 // Check if the current node of a build vector is a zero-extended vector.
6755 // If so, return the value extended.
6756 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6757 // NumElt - return the number of zero-extended identical values.
6758 // EltType - return the type of the value including the zero extension.
6759 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6760 unsigned &NumElt, MVT &EltType) {
6761 SDValue ExtValue = Op->getOperand(0);
6762 unsigned NumElts = Op->getNumOperands();
6763 unsigned Delta = NumElts;
6765 for (unsigned i = 1; i < NumElts; i++) {
6766 if (Op->getOperand(i) == ExtValue) {
6770 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
6773 if (!isPowerOf2_32(Delta) || Delta == 1)
6776 for (unsigned i = Delta; i < NumElts; i++) {
6777 if (i % Delta == 0) {
6778 if (Op->getOperand(i) != ExtValue)
6780 } else if (!(isNullConstant(Op->getOperand(i)) ||
6781 Op->getOperand(i).isUndef()))
6784 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
6785 unsigned ExtVTSize = EltSize * Delta;
6786 EltType = MVT::getIntegerVT(ExtVTSize);
6787 NumElt = NumElts / Delta;
6791 /// Attempt to use the vbroadcast instruction to generate a splat value
6792 /// from a splat BUILD_VECTOR which uses:
6793 /// a. A single scalar load, or a constant.
6794 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6796 /// The VBROADCAST node is returned when a pattern is found,
6797 /// or SDValue() otherwise.
6798 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6799 const X86Subtarget &Subtarget,
6800 SelectionDAG &DAG) {
6801 // VBROADCAST requires AVX.
6802 // TODO: Splats could be generated for non-AVX CPUs using SSE
6803 // instructions, but there's less potential gain for only 128-bit vectors.
6804 if (!Subtarget.hasAVX())
6807 MVT VT = BVOp->getSimpleValueType(0);
6810 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6811 "Unsupported vector type for broadcast.");
6813 BitVector UndefElements;
6814 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6816 // Attempt to use VBROADCASTM
6817 // From this pattern:
6818 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
6819 // b. t1 = (build_vector t0 t0)
6821 // Create (VBROADCASTM v2i1 X)
6822 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
6823 MVT EltType = VT.getScalarType();
6824 unsigned NumElts = VT.getVectorNumElements();
6826 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
6827 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
6828 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
6829 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
6831 BOperand = ZeroExtended.getOperand(0);
6833 BOperand = Ld.getOperand(0).getOperand(0);
6834 if (BOperand.getValueType().isVector() &&
6835 BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
6836 if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
6837 NumElts == 8)) || // for broadcastmb2q
6838 (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
6839 NumElts == 16))) { // for broadcastmw2d
6841 DAG.getNode(X86ISD::VBROADCASTM, dl,
6842 MVT::getVectorVT(EltType, NumElts), BOperand);
6843 return DAG.getBitcast(VT, Brdcst);
6849 // We need a splat of a single value to use broadcast, and it doesn't
6850 // make any sense if the value is only in one element of the vector.
6851 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6852 APInt SplatValue, Undef;
6853 unsigned SplatBitSize;
6855 // Check if this is a repeated constant pattern suitable for broadcasting.
6856 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6857 SplatBitSize > VT.getScalarSizeInBits() &&
6858 SplatBitSize < VT.getSizeInBits()) {
6859 // Avoid replacing with broadcast when it's a use of a shuffle
6860 // instruction to preserve the present custom lowering of shuffles.
6861 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6863 // replace BUILD_VECTOR with broadcast of the repeated constants.
6864 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6865 LLVMContext *Ctx = DAG.getContext();
6866 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6867 if (Subtarget.hasAVX()) {
6868 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6869 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6870 // Splatted value can fit in one INTEGER constant in constant pool.
6871 // Load the constant and broadcast it.
6872 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6873 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6874 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6875 SDValue CP = DAG.getConstantPool(C, PVT);
6876 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6878 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6880 CVT, dl, DAG.getEntryNode(), CP,
6881 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6883 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6884 MVT::getVectorVT(CVT, Repeat), Ld);
6885 return DAG.getBitcast(VT, Brdcst);
6886 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6887 // Splatted value can fit in one FLOAT constant in constant pool.
6888 // Load the constant and broadcast it.
6889 // AVX has support for 32 and 64 bit broadcasts for floats only.
6890 // There is no 64-bit integer broadcast on a 32-bit subtarget.
6891 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6892 // Lower the splat via APFloat directly, to avoid any conversion.
6895 ? ConstantFP::get(*Ctx,
6896 APFloat(APFloat::IEEEsingle(), SplatValue))
6897 : ConstantFP::get(*Ctx,
6898 APFloat(APFloat::IEEEdouble(), SplatValue));
6899 SDValue CP = DAG.getConstantPool(C, PVT);
6900 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6902 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6904 CVT, dl, DAG.getEntryNode(), CP,
6905 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6907 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6908 MVT::getVectorVT(CVT, Repeat), Ld);
6909 return DAG.getBitcast(VT, Brdcst);
6910 } else if (SplatBitSize > 64) {
6911 // Load the vector of constants and broadcast it.
6912 MVT CVT = VT.getScalarType();
6913 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6915 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6916 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6917 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6919 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6920 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6922 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6923 return DAG.getBitcast(VT, Brdcst);
6930 bool ConstSplatVal =
6931 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6933 // Make sure that all of the users of a non-constant load are from the
6934 // BUILD_VECTOR node.
6935 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6938 unsigned ScalarSize = Ld.getValueSizeInBits();
6939 bool IsGE256 = (VT.getSizeInBits() >= 256);
6941 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6942 // instruction to save 8 or more bytes of constant pool data.
6943 // TODO: If multiple splats are generated to load the same constant,
6944 // it may be detrimental to overall size. There needs to be a way to detect
6945 // that condition to know if this is truly a size win.
6946 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
6948 // Handle broadcasting a single constant scalar from the constant pool
6950 // On Sandybridge (no AVX2), it is still better to load a constant vector
6951 // from the constant pool and not to broadcast it from a scalar.
6952 // But override that restriction when optimizing for size.
6953 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6954 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6955 EVT CVT = Ld.getValueType();
6956 assert(!CVT.isVector() && "Must not broadcast a vector type");
6958 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6959 // For size optimization, also splat v2f64 and v2i64, and for size opt
6960 // with AVX2, also splat i8 and i16.
6961 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6962 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6963 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6964 const Constant *C = nullptr;
6965 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6966 C = CI->getConstantIntValue();
6967 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6968 C = CF->getConstantFPValue();
6970 assert(C && "Invalid constant type");
6972 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6974 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6975 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6977 CVT, dl, DAG.getEntryNode(), CP,
6978 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6981 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6985 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6987 // Handle AVX2 in-register broadcasts.
6988 if (!IsLoad && Subtarget.hasInt256() &&
6989 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6990 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6992 // The scalar source must be a normal load.
6996 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6997 (Subtarget.hasVLX() && ScalarSize == 64))
6998 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7000 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7001 // match double, since there is no vbroadcastsd xmm.
7002 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7003 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7004 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7007 // Unsupported broadcast.
7011 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
7012 /// underlying vector and index.
7014 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7016 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7018 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7019 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7022 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7024 // (extract_vector_elt (v8f32 %1), Constant<6>)
7026 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7027 // (extract_subvector (v8f32 %0), Constant<4>),
7030 // In this case the vector is the extract_subvector expression and the index
7031 // is 2, as specified by the shuffle.
7032 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7033 SDValue ShuffleVec = SVOp->getOperand(0);
7034 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7035 assert(ShuffleVecVT.getVectorElementType() ==
7036 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7038 int ShuffleIdx = SVOp->getMaskElt(Idx);
7039 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7040 ExtractedFromVec = ShuffleVec;
7046 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7047 MVT VT = Op.getSimpleValueType();
7049 // Skip if insert_vec_elt is not supported.
7050 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7051 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7055 unsigned NumElems = Op.getNumOperands();
7059 SmallVector<unsigned, 4> InsertIndices;
7060 SmallVector<int, 8> Mask(NumElems, -1);
7062 for (unsigned i = 0; i != NumElems; ++i) {
7063 unsigned Opc = Op.getOperand(i).getOpcode();
7065 if (Opc == ISD::UNDEF)
7068 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7069 // Quit if more than 1 elements need inserting.
7070 if (InsertIndices.size() > 1)
7073 InsertIndices.push_back(i);
7077 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7078 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7080 // Quit if non-constant index.
7081 if (!isa<ConstantSDNode>(ExtIdx))
7083 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7085 // Quit if extracted from vector of different type.
7086 if (ExtractedFromVec.getValueType() != VT)
7089 if (!VecIn1.getNode())
7090 VecIn1 = ExtractedFromVec;
7091 else if (VecIn1 != ExtractedFromVec) {
7092 if (!VecIn2.getNode())
7093 VecIn2 = ExtractedFromVec;
7094 else if (VecIn2 != ExtractedFromVec)
7095 // Quit if more than 2 vectors to shuffle
7099 if (ExtractedFromVec == VecIn1)
7101 else if (ExtractedFromVec == VecIn2)
7102 Mask[i] = Idx + NumElems;
7105 if (!VecIn1.getNode())
7108 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7109 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7111 for (unsigned Idx : InsertIndices)
7112 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7113 DAG.getIntPtrConstant(Idx, DL));
7118 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7119 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7120 Op.getScalarValueSizeInBits() == 1 &&
7121 "Can not convert non-constant vector");
7122 uint64_t Immediate = 0;
7123 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7124 SDValue In = Op.getOperand(idx);
7126 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7129 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7130 return DAG.getConstant(Immediate, dl, VT);
7132 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7133 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7134 const X86Subtarget &Subtarget) {
7136 MVT VT = Op.getSimpleValueType();
7137 assert((VT.getVectorElementType() == MVT::i1) &&
7138 "Unexpected type in LowerBUILD_VECTORvXi1!");
7141 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7144 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7147 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7148 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7149 // Split the pieces.
7151 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7153 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7154 // We have to manually lower both halves so getNode doesn't try to
7155 // reassemble the build_vector.
7156 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7157 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7158 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7160 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7161 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7162 return DAG.getBitcast(VT, Imm);
7163 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7164 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7165 DAG.getIntPtrConstant(0, dl));
7168 // Vector has one or more non-const elements
7169 uint64_t Immediate = 0;
7170 SmallVector<unsigned, 16> NonConstIdx;
7171 bool IsSplat = true;
7172 bool HasConstElts = false;
7174 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7175 SDValue In = Op.getOperand(idx);
7178 if (!isa<ConstantSDNode>(In))
7179 NonConstIdx.push_back(idx);
7181 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7182 HasConstElts = true;
7186 else if (In != Op.getOperand(SplatIdx))
7190 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7192 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7193 DAG.getConstant(1, dl, VT),
7194 DAG.getConstant(0, dl, VT));
7196 // insert elements one by one
7200 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7201 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7203 else if (HasConstElts)
7204 Imm = DAG.getConstant(0, dl, VT);
7206 Imm = DAG.getUNDEF(VT);
7207 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7208 DstVec = DAG.getBitcast(VT, Imm);
7210 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7211 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7212 DAG.getIntPtrConstant(0, dl));
7215 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7216 unsigned InsertIdx = NonConstIdx[i];
7217 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7218 Op.getOperand(InsertIdx),
7219 DAG.getIntPtrConstant(InsertIdx, dl));
7224 /// \brief Return true if \p N implements a horizontal binop and return the
7225 /// operands for the horizontal binop into V0 and V1.
7227 /// This is a helper function of LowerToHorizontalOp().
7228 /// This function checks that the build_vector \p N in input implements a
7229 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7230 /// operation to match.
7231 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7232 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7233 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7236 /// This function only analyzes elements of \p N whose indices are
7237 /// in range [BaseIdx, LastIdx).
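/// For example, with \p Opcode equal to ISD::ADD, the v4i32 build_vector
///   (add (extract_elt A, 0), (extract_elt A, 1)),
///   (add (extract_elt A, 2), (extract_elt A, 3)),
///   (add (extract_elt B, 0), (extract_elt B, 1)),
///   (add (extract_elt B, 2), (extract_elt B, 3))
/// matches with V0 = A and V1 = B, which is the horizontal add pattern
/// {A0+A1, A2+A3, B0+B1, B2+B3}.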
7238 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7240 unsigned BaseIdx, unsigned LastIdx,
7241 SDValue &V0, SDValue &V1) {
7242 EVT VT = N->getValueType(0);
7244 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7245 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7246 "Invalid Vector in input!");
7248 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7249 bool CanFold = true;
7250 unsigned ExpectedVExtractIdx = BaseIdx;
7251 unsigned NumElts = LastIdx - BaseIdx;
7252 V0 = DAG.getUNDEF(VT);
7253 V1 = DAG.getUNDEF(VT);
7255 // Check if N implements a horizontal binop.
7256 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7257 SDValue Op = N->getOperand(i + BaseIdx);
7260 if (Op->isUndef()) {
7261 // Update the expected vector extract index.
7262 if (i * 2 == NumElts)
7263 ExpectedVExtractIdx = BaseIdx;
7264 ExpectedVExtractIdx += 2;
7268 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7273 SDValue Op0 = Op.getOperand(0);
7274 SDValue Op1 = Op.getOperand(1);
7276 // Try to match the following pattern:
7277 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7278 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7279 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7280 Op0.getOperand(0) == Op1.getOperand(0) &&
7281 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7282 isa<ConstantSDNode>(Op1.getOperand(1)));
7286 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7287 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7289 if (i * 2 < NumElts) {
7291 V0 = Op0.getOperand(0);
7292 if (V0.getValueType() != VT)
7297 V1 = Op0.getOperand(0);
7298 if (V1.getValueType() != VT)
7301 if (i * 2 == NumElts)
7302 ExpectedVExtractIdx = BaseIdx;
7305 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7306 if (I0 == ExpectedVExtractIdx)
7307 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7308 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7309 // Try to match the following dag sequence:
7310 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7311 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7315 ExpectedVExtractIdx += 2;
7321 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7322 /// a concat_vector.
7324 /// This is a helper function of LowerToHorizontalOp().
7325 /// This function expects two 256-bit vectors called V0 and V1.
7326 /// At first, each vector is split into two separate 128-bit vectors.
7327 /// Then, the resulting 128-bit vectors are used to implement two
7328 /// horizontal binary operations.
7330 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7332 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7333 /// the two new horizontal binop.
7334 /// When Mode is set, the first horizontal binop dag node would take as input
7335 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7336 /// horizontal binop dag node would take as input the lower 128-bit of V1
7337 /// and the upper 128-bit of V1.
7339 /// HADD V0_LO, V0_HI
7340 /// HADD V1_LO, V1_HI
7342 /// Otherwise, the first horizontal binop dag node takes as input the lower
7343 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7344 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7346 /// HADD V0_LO, V1_LO
7347 /// HADD V0_HI, V1_HI
7349 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7350 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7351 /// the upper 128-bits of the result.
7352 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7353 const SDLoc &DL, SelectionDAG &DAG,
7354 unsigned X86Opcode, bool Mode,
7355 bool isUndefLO, bool isUndefHI) {
7356 MVT VT = V0.getSimpleValueType();
7357 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7358 "Invalid nodes in input!");
7360 unsigned NumElts = VT.getVectorNumElements();
7361 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7362 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7363 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7364 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7365 MVT NewVT = V0_LO.getSimpleValueType();
7367 SDValue LO = DAG.getUNDEF(NewVT);
7368 SDValue HI = DAG.getUNDEF(NewVT);
7371 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7372 if (!isUndefLO && !V0->isUndef())
7373 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7374 if (!isUndefHI && !V1->isUndef())
7375 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7377 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7378 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7379 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7381 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7382 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7385 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7388 /// Returns true iff \p BV builds a vector with the result equivalent to
7389 /// the result of ADDSUB operation.
7390 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7391 /// are written to the parameters \p Opnd0 and \p Opnd1.
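/// For example, the v2f64 build_vector
///   (fsub (extract_elt A, 0), (extract_elt B, 0)),
///   (fadd (extract_elt A, 1), (extract_elt B, 1))
/// matches the ADDSUB idiom with \p Opnd0 = A and \p Opnd1 = B.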
7392 static bool isAddSub(const BuildVectorSDNode *BV,
7393 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7394 SDValue &Opnd0, SDValue &Opnd1,
7395 unsigned &NumExtracts) {
7397 MVT VT = BV->getSimpleValueType(0);
7398 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7399 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7400 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7403 unsigned NumElts = VT.getVectorNumElements();
7404 SDValue InVec0 = DAG.getUNDEF(VT);
7405 SDValue InVec1 = DAG.getUNDEF(VT);
7409 // Odd-numbered elements in the input build vector are obtained from
7410 // adding two integer/float elements.
7411 // Even-numbered elements in the input build vector are obtained from
7412 // subtracting two integer/float elements.
7413 unsigned ExpectedOpcode = ISD::FSUB;
7414 unsigned NextExpectedOpcode = ISD::FADD;
7415 bool AddFound = false;
7416 bool SubFound = false;
7418 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7419 SDValue Op = BV->getOperand(i);
7421 // Skip 'undef' values.
7422 unsigned Opcode = Op.getOpcode();
7423 if (Opcode == ISD::UNDEF) {
7424 std::swap(ExpectedOpcode, NextExpectedOpcode);
7428 // Early exit if we found an unexpected opcode.
7429 if (Opcode != ExpectedOpcode)
7432 SDValue Op0 = Op.getOperand(0);
7433 SDValue Op1 = Op.getOperand(1);
7435 // Try to match the following pattern:
7436 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7437 // Early exit if we cannot match that sequence.
7438 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7439 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7440 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7441 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7442 Op0.getOperand(1) != Op1.getOperand(1))
7445 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7449 // We found a valid add/sub node. Update the information accordingly.
7455 // Update InVec0 and InVec1.
7456 if (InVec0.isUndef()) {
7457 InVec0 = Op0.getOperand(0);
7458 if (InVec0.getSimpleValueType() != VT)
7461 if (InVec1.isUndef()) {
7462 InVec1 = Op1.getOperand(0);
7463 if (InVec1.getSimpleValueType() != VT)
7467 // Make sure that operands in input to each add/sub node always
7468 // come from a same pair of vectors.
7469 if (InVec0 != Op0.getOperand(0)) {
7470 if (ExpectedOpcode == ISD::FSUB)
7473 // FADD is commutable. Try to commute the operands
7474 // and then test again.
7475 std::swap(Op0, Op1);
7476 if (InVec0 != Op0.getOperand(0))
7480 if (InVec1 != Op1.getOperand(0))
7483 // Update the pair of expected opcodes.
7484 std::swap(ExpectedOpcode, NextExpectedOpcode);
7486 // Increment the number of extractions done.
7490 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7491 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7499 /// Returns true if it is possible to fold MUL and an idiom that has already been
7500 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7501 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7502 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7504 /// Prior to calling this function it should be known that there is some
7505 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7506 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7507 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7508 /// of \p Opnd0 uses is expected to be equal to 2.
7509 /// For example, this function may be called for the following IR:
7510 /// %AB = fmul fast <2 x double> %A, %B
7511 /// %Sub = fsub fast <2 x double> %AB, %C
7512 /// %Add = fadd fast <2 x double> %AB, %C
7513 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7514 /// <2 x i32> <i32 0, i32 3>
7515 /// There is a def for %Addsub here, which potentially can be replaced by
7516 /// X86ISD::ADDSUB operation:
7517 /// %Addsub = X86ISD::ADDSUB %AB, %C
7518 /// and such ADDSUB can further be replaced with FMADDSUB:
7519 /// %Addsub = FMADDSUB %A, %B, %C.
7521 /// The main reason why this method is called before the replacement of the
7522 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7523 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7525 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7527 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7528 unsigned ExpectedUses) {
7529 if (Opnd0.getOpcode() != ISD::FMUL ||
7530 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
7533 // FIXME: These checks must match the similar ones in
7534 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7535 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7536 // or MUL + ADDSUB to FMADDSUB.
7537 const TargetOptions &Options = DAG.getTarget().Options;
7539 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7544 Opnd1 = Opnd0.getOperand(1);
7545 Opnd0 = Opnd0.getOperand(0);
7550 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
7551 /// operation accordingly into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7552 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7553 const X86Subtarget &Subtarget,
7554 SelectionDAG &DAG) {
7555 SDValue Opnd0, Opnd1;
7556 unsigned NumExtracts;
7557 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts))
7560 MVT VT = BV->getSimpleValueType(0);
7563 // Try to generate X86ISD::FMADDSUB node here.
7565 // TODO: According to coverage reports, the FMADDSUB transform is not
7566 // triggered by any tests.
7567 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
7568 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7570 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7571 // the ADDSUB idiom has been successfully recognized. There are no known
7572 // X86 targets with 512-bit ADDSUB instructions!
7573 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7575 if (VT.is512BitVector())
7578 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7581 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7582 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7583 const X86Subtarget &Subtarget,
7584 SelectionDAG &DAG) {
7585 MVT VT = BV->getSimpleValueType(0);
7586 unsigned NumElts = VT.getVectorNumElements();
7587 unsigned NumUndefsLO = 0;
7588 unsigned NumUndefsHI = 0;
7589 unsigned Half = NumElts/2;
7591 // Count the number of UNDEF operands in the input build_vector.
7592 for (unsigned i = 0, e = Half; i != e; ++i)
7593 if (BV->getOperand(i)->isUndef())
7596 for (unsigned i = Half, e = NumElts; i != e; ++i)
7597 if (BV->getOperand(i)->isUndef())
7600 // Early exit if this is either a build_vector of all UNDEFs or all the
7601 // operands but one are UNDEF.
7602 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7606 SDValue InVec0, InVec1;
7607 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7608 // Try to match an SSE3 float HADD/HSUB.
7609 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7610 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7612 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7613 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7614 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7615 // Try to match an SSSE3 integer HADD/HSUB.
7616 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7617 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7619 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7620 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7623 if (!Subtarget.hasAVX())
7626 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7627 // Try to match an AVX horizontal add/sub of packed single/double
7628 // precision floating point values from 256-bit vectors.
7629 SDValue InVec2, InVec3;
7630 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7631 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7632 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7633 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7634 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7636 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7637 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7638 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7639 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7640 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7641 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7642 // Try to match an AVX2 horizontal add/sub of signed integers.
7643 SDValue InVec2, InVec3;
7645 bool CanFold = true;
7647 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7648 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7649 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7650 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7651 X86Opcode = X86ISD::HADD;
7652 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7653 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7654 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7655 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7656 X86Opcode = X86ISD::HSUB;
7661 // Fold this build_vector into a single horizontal add/sub.
7662 // Do this only if the target has AVX2.
7663 if (Subtarget.hasAVX2())
7664 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7666 // Do not try to expand this build_vector into a pair of horizontal
7667 // add/sub if we can emit a pair of scalar add/sub.
7668 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7671 // Convert this build_vector into a pair of horizontal binop followed by
7673 bool isUndefLO = NumUndefsLO == Half;
7674 bool isUndefHI = NumUndefsHI == Half;
7675 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7676 isUndefLO, isUndefHI);
7680 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7681 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7683 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7684 X86Opcode = X86ISD::HADD;
7685 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7686 X86Opcode = X86ISD::HSUB;
7687 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7688 X86Opcode = X86ISD::FHADD;
7689 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7690 X86Opcode = X86ISD::FHSUB;
7694 // Don't try to expand this build_vector into a pair of horizontal add/sub
7695 // if we can simply emit a pair of scalar add/sub.
7696 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7699 // Convert this build_vector into two horizontal add/sub followed by
7701 bool isUndefLO = NumUndefsLO == Half;
7702 bool isUndefHI = NumUndefsHI == Half;
7703 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7704 isUndefLO, isUndefHI);
7710 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7711 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7712 /// just apply the bit to the vectors.
7713 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7714 /// from this, but enough scalar bit operations are created from the later
7715 /// legalization + scalarization stages to need basic support.
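/// For example, (build_vector (and a, 1), (and b, 2), (and c, 4), (and d, 8))
/// becomes (and (build_vector a, b, c, d), (build_vector 1, 2, 4, 8)).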
7716 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7717 SelectionDAG &DAG) {
7719 MVT VT = Op->getSimpleValueType(0);
7720 unsigned NumElems = VT.getVectorNumElements();
7721 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7723 // Check that all elements have the same opcode.
7724 // TODO: Should we allow UNDEFS and if so how many?
7725 unsigned Opcode = Op->getOperand(0).getOpcode();
7726 for (unsigned i = 1; i < NumElems; ++i)
7727 if (Opcode != Op->getOperand(i).getOpcode())
7730 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7737 // Don't do this if the buildvector is a splat - we'd replace one
7738 // constant with an entire vector.
7739 if (Op->getSplatValue())
7741 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7746 SmallVector<SDValue, 4> LHSElts, RHSElts;
7747 for (SDValue Elt : Op->ops()) {
7748 SDValue LHS = Elt.getOperand(0);
7749 SDValue RHS = Elt.getOperand(1);
7751 // We expect the canonicalized RHS operand to be the constant.
7752 if (!isa<ConstantSDNode>(RHS))
7754 LHSElts.push_back(LHS);
7755 RHSElts.push_back(RHS);
7758 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7759 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7760 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7763 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7764 /// functionality to do this, so it's all zeros, all ones, or some derivation
7765 /// that is cheap to calculate.
7766 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7767 const X86Subtarget &Subtarget) {
7769 MVT VT = Op.getSimpleValueType();
7771 // Vectors containing all zeros can be matched by pxor and xorps.
7772 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7773 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7774 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7775 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7778 return getZeroVector(VT, Subtarget, DAG, DL);
7781 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7782 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7783 // vpcmpeqd on 256-bit vectors.
7784 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7785 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7786 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7789 return getOnesVector(VT, DAG, DL);
7795 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
7796 // reasoned to be a permutation of a vector by indices in a non-constant vector.
7797 // (build_vector (extract_elt V, (extract_elt I, 0)),
7798 // (extract_elt V, (extract_elt I, 1)),
7803 // TODO: Handle undefs
7804 // TODO: Utilize pshufb and zero mask blending to support more efficient
7805 // construction of vectors with constant-0 elements.
7806 // TODO: Use smaller-element vectors of the same width, and "interpolate" the
7807 // indices, when no native operation is available.
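// When the pattern matches, the whole build_vector is lowered to a single
// variable permute; for example, a v8i32 build_vector of this form becomes
// (VPERMV IndicesVec, SrcVec), or (PSHUFB SrcVec, IndicesVec) for v16i8.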
7809 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
7810 const X86Subtarget &Subtarget) {
7811 // Look for VPERMV and PSHUFB opportunities.
7812 MVT VT = V.getSimpleValueType();
7813 switch (VT.SimpleTy) {
7817 if (!Subtarget.hasSSE3())
7822 if (!Subtarget.hasAVX2())
7827 if (!Subtarget.hasVLX())
7834 if (!Subtarget.hasAVX512())
7838 if (!Subtarget.hasBWI())
7843 if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
7847 if (!Subtarget.hasVBMI())
7851 if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
7855 SDValue SrcVec, IndicesVec;
7856 // Check for a match of the permute source vector and permute index elements.
7857 // This is done by checking that the i-th build_vector operand is of the form:
7858 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
7859 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
7860 SDValue Op = V.getOperand(Idx);
7861 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7864 // If this is the first extract encountered in V, set the source vector,
7865 // otherwise verify the extract is from the previously defined source
7868 SrcVec = Op.getOperand(0);
7869 else if (SrcVec != Op.getOperand(0))
7871 SDValue ExtractedIndex = Op->getOperand(1);
7872 // Peek through extends.
7873 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
7874 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
7875 ExtractedIndex = ExtractedIndex.getOperand(0);
7876 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7879 // If this is the first extract from the index vector candidate, set the
7880 // indices vector, otherwise verify the extract is from the previously
7881 // defined indices vector.
7883 IndicesVec = ExtractedIndex.getOperand(0);
7884 else if (IndicesVec != ExtractedIndex.getOperand(0))
7887 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
7888 if (!PermIdx || PermIdx->getZExtValue() != Idx)
7892 if (VT.isFloatingPoint())
7893 IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
7894 VT.getVectorNumElements());
7895 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
7896 if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
7898 DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
7899 SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
7901 if (VT == MVT::v16i8)
7902 return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
7903 return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
7907 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7910 MVT VT = Op.getSimpleValueType();
7911 MVT ExtVT = VT.getVectorElementType();
7912 unsigned NumElems = Op.getNumOperands();
7914 // Generate vectors for predicate vectors.
7915 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7916 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
7918 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7919 return VectorConstant;
7921 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7922 // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
7924 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7926 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7927 return HorizontalOp;
7928 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7930 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7933 unsigned EVTBits = ExtVT.getSizeInBits();
7935 unsigned NumZero = 0;
7936 unsigned NumNonZero = 0;
7937 uint64_t NonZeros = 0;
7938 bool IsAllConstants = true;
7939 SmallSet<SDValue, 8> Values;
7940 unsigned NumConstants = NumElems;
7941 for (unsigned i = 0; i < NumElems; ++i) {
7942 SDValue Elt = Op.getOperand(i);
7946 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
7947 IsAllConstants = false;
7950 if (X86::isZeroNode(Elt))
7953 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7954 NonZeros |= ((uint64_t)1 << i);
7959 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7960 if (NumNonZero == 0)
7961 return DAG.getUNDEF(VT);
7963 // If we are inserting one variable into a vector of non-zero constants, try
7964 // to avoid loading each constant element as a scalar. Load the constants as a
7965 // vector and then insert the variable scalar element. If insertion is not
7966 // supported, we assume that we will fall back to a shuffle to get the scalar
7967 // blended with the constants. Insertion into a zero vector is handled as a
7968 // special-case somewhere below here.
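// For example (illustrative), (build_vector x, 1.0, 2.0, 3.0) is lowered to a
// constant-pool load of <undef, 1.0, 2.0, 3.0> followed by
// (insert_vector_elt Ld, x, 0).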
7969 LLVMContext &Context = *DAG.getContext();
7970 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
7971 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
7972 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
7973 // Create an all-constant vector. The variable element in the old
7974 // build vector is replaced by undef in the constant vector. Save the
7975 // variable scalar element and its index for use in the insertelement.
7976 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
7977 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
7980 for (unsigned i = 0; i != NumElems; ++i) {
7981 SDValue Elt = Op.getOperand(i);
7982 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
7983 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
7984 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
7985 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
7986 else if (!Elt.isUndef()) {
7987 assert(!VarElt.getNode() && !InsIndex.getNode() &&
7988 "Expected one variable element in this vector");
7990 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
7993 Constant *CV = ConstantVector::get(ConstVecOps);
7994 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
7996 // The constants we just created may not be legal (eg, floating point). We
7997 // must lower the vector right here because we can not guarantee that we'll
7998 // legalize it before loading it. This is also why we could not just create
7999 // a new build vector here. If the build vector contains illegal constants,
8000 // it could get split back up into a series of insert elements.
8001 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8002 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8003 MachineFunction &MF = DAG.getMachineFunction();
8004 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8005 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8006 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8009 // Special case for single non-zero, non-undef, element.
8010 if (NumNonZero == 1) {
8011 unsigned Idx = countTrailingZeros(NonZeros);
8012 SDValue Item = Op.getOperand(Idx);
8014 // If this is an insertion of an i64 value on x86-32, and if the top bits of
8015 // the value are obviously zero, truncate the value to i32 and do the
8016 // insertion that way. Only do this if the value is non-constant or if the
8017 // value is a constant being inserted into element 0. It is cheaper to do
8018 // a constant pool load than it is to do a movd + shuffle.
8019 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
8020 (!IsAllConstants || Idx == 0)) {
8021 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
8023 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
8024 MVT VecVT = MVT::v4i32;
8026 // Truncate the value (which may itself be a constant) to i32, and
8027 // convert it to a vector with movd (S2V+shuffle to zero extend).
8028 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
8029 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
8030 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
8031 Item, Idx * 2, true, Subtarget, DAG));
8035 // If we have a constant or non-constant insertion into the low element of
8036 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8037 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8038 // depending on what the source datatype is.
8041 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8043 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
8044 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
8045 assert((VT.is128BitVector() || VT.is256BitVector() ||
8046 VT.is512BitVector()) &&
8047 "Expected an SSE value type!");
8048 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8049 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
8050 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8053 // We can't directly insert an i8 or i16 into a vector, so zero extend
8055 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
8056 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8057 if (VT.getSizeInBits() >= 256) {
8058 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
8059 if (Subtarget.hasAVX()) {
8060 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8061 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8063 // Without AVX, we need to extend to a 128-bit vector and then
8064 // insert into the 256-bit vector.
8065 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8066 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
8067 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
8070 assert(VT.is128BitVector() && "Expected an SSE value type!");
8071 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8072 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8074 return DAG.getBitcast(VT, Item);
8078 // Is it a vector logical left shift?
8079 if (NumElems == 2 && Idx == 1 &&
8080 X86::isZeroNode(Op.getOperand(0)) &&
8081 !X86::isZeroNode(Op.getOperand(1))) {
8082 unsigned NumBits = VT.getSizeInBits();
8083 return getVShift(true, VT,
8084 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8085 VT, Op.getOperand(1)),
8086 NumBits/2, DAG, *this, dl);
8089 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8092 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8093 // is a non-constant being inserted into an element other than the low one,
8094 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8095 // movd/movss) to move this into the low element, then shuffle it into
8097 if (EVTBits == 32) {
8098 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8099 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8103 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8104 if (Values.size() == 1) {
8105 if (EVTBits == 32) {
8106 // Instead of a shuffle like this:
8107 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8108 // Check if it's possible to issue this instead.
8109 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8110 unsigned Idx = countTrailingZeros(NonZeros);
8111 SDValue Item = Op.getOperand(Idx);
8112 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8113 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8118 // A vector full of immediates; various special cases are already
8119 // handled, so this is best done with a single constant-pool load.
8123 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8126 // See if we can use a vector load to get all of the elements.
8127 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
8128 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8130 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8134 // For AVX-length vectors, build the individual 128-bit pieces and use
8135 // shuffles to put them in place.
8136 if (VT.is256BitVector() || VT.is512BitVector()) {
8137 EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);
8139 // Build both the lower and upper subvector.
8141 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8142 SDValue Upper = DAG.getBuildVector(
8143 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8145 // Recreate the wider vector with the lower and upper part.
8146 if (VT.is256BitVector())
8147 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
8148 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
8151 // Let legalizer expand 2-wide build_vectors.
8152 if (EVTBits == 64) {
8153 if (NumNonZero == 1) {
8154 // One half is zero or undef.
8155 unsigned Idx = countTrailingZeros(NonZeros);
8156 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8157 Op.getOperand(Idx));
8158 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8163 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8164 if (EVTBits == 8 && NumElems == 16)
8165 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8169 if (EVTBits == 16 && NumElems == 8)
8170 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8174 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8175 if (EVTBits == 32 && NumElems == 4)
8176 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8179 // If element VT is == 32 bits, turn it into a number of shuffles.
8180 if (NumElems == 4 && NumZero > 0) {
8181 SmallVector<SDValue, 8> Ops(NumElems);
8182 for (unsigned i = 0; i < 4; ++i) {
8183 bool isZero = !(NonZeros & (1ULL << i));
8185 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8187 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8190 for (unsigned i = 0; i < 2; ++i) {
8191 switch ((NonZeros >> (i*2)) & 0x3) {
8192 default: llvm_unreachable("Unexpected NonZero count");
8194 Ops[i] = Ops[i*2]; // Must be a zero vector.
8197 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8200 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8203 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8208 bool Reverse1 = (NonZeros & 0x3) == 2;
8209 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8213 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8214 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8216 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8219 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8221 // Check for a build vector from mostly shuffle plus few inserting.
8222 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8225 // For SSE 4.1, use insertps to put the high elements into the low element.
8226 if (Subtarget.hasSSE41()) {
8228 if (!Op.getOperand(0).isUndef())
8229 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8231 Result = DAG.getUNDEF(VT);
8233 for (unsigned i = 1; i < NumElems; ++i) {
8234 if (Op.getOperand(i).isUndef()) continue;
8235 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8236 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8241 // Otherwise, expand into a number of unpckl*, start by extending each of
8242 // our (non-undef) elements to the full vector width with the element in the
8243 // bottom slot of the vector (which generates no code for SSE).
8244 SmallVector<SDValue, 8> Ops(NumElems);
8245 for (unsigned i = 0; i < NumElems; ++i) {
8246 if (!Op.getOperand(i).isUndef())
8247 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8249 Ops[i] = DAG.getUNDEF(VT);
8252 // Next, we iteratively mix elements, e.g. for v4f32:
8253 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8254 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8255 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8256 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8257 // Generate scaled UNPCKL shuffle mask.
8258 SmallVector<int, 16> Mask;
8259 for(unsigned i = 0; i != Scale; ++i)
8261 for (unsigned i = 0; i != Scale; ++i)
8262 Mask.push_back(NumElems+i);
8263 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8265 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8266 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8271 // 256-bit AVX can use the vinsertf128 instruction
8272 // to create 256-bit vectors from two other 128-bit ones.
8273 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
8275 MVT ResVT = Op.getSimpleValueType();
8277 assert((ResVT.is256BitVector() ||
8278 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8280 SDValue V1 = Op.getOperand(0);
8281 SDValue V2 = Op.getOperand(1);
8282 unsigned NumElems = ResVT.getVectorNumElements();
8283 if (ResVT.is256BitVector())
8284 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8286 if (Op.getNumOperands() == 4) {
8287 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8288 ResVT.getVectorNumElements()/2);
8289 SDValue V3 = Op.getOperand(2);
8290 SDValue V4 = Op.getOperand(3);
8291 return concat256BitVectors(
8292 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
8293 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
8296 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8299 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8300 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8301 static bool isExpandWithZeros(const SDValue &Op) {
8302 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8303 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8305 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8306 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8312 // If the given node is a type promotion (by concatenating i1 zeros) of the
8313 // result of a node that already zeroes all upper bits of its output register,
8314 // returns that zeroing node; otherwise returns an empty SDValue.
8315 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8316 unsigned Opc = Op.getOpcode();
8318 assert(Opc == ISD::CONCAT_VECTORS &&
8319 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8320 "Unexpected node to check for type promotion!");
8322 // As long as we are concatenating zeros to the upper part of a previous node
8323 // result, climb up the tree until a node with different opcode is
8325 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8326 if (Opc == ISD::INSERT_SUBVECTOR) {
8327 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8328 Op.getConstantOperandVal(2) == 0)
8329 Op = Op.getOperand(1);
8332 } else { // Opc == ISD::CONCAT_VECTORS
8333 if (isExpandWithZeros(Op))
8334 Op = Op.getOperand(0);
8338 Opc = Op.getOpcode();
8341 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8342 // of a node that zeros the upper bits (its masked version).
8343 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8344 (Op.getOpcode() == ISD::AND &&
8345 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8346 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8353 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8354 const X86Subtarget &Subtarget,
8355 SelectionDAG & DAG) {
8357 MVT ResVT = Op.getSimpleValueType();
8358 unsigned NumOperands = Op.getNumOperands();
8360 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
8361 "Unexpected number of operands in CONCAT_VECTORS");
8363 // If this node promotes - by concatenating zeroes - the type of the result
8364 // of a node whose instruction already zeroes all upper (irrelevant) bits of
8365 // the output register, mark it as legal and catch the pattern in instruction
8366 // selection to avoid emitting extra instructions (for zeroing upper bits).
8367 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8368 SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
8369 SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
8370 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
8374 unsigned NumZero = 0;
8375 unsigned NumNonZero = 0;
8376 uint64_t NonZeros = 0;
8377 for (unsigned i = 0; i != NumOperands; ++i) {
8378 SDValue SubVec = Op.getOperand(i);
8379 if (SubVec.isUndef())
8381 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8384 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8385 NonZeros |= (uint64_t)1 << i;
8391 // If there are zero or one non-zeros we can handle this very simply.
8392 if (NumNonZero <= 1) {
8393 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8394 : DAG.getUNDEF(ResVT);
8397 unsigned Idx = countTrailingZeros(NonZeros);
8398 SDValue SubVec = Op.getOperand(Idx);
8399 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
8400 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
8401 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
8404 if (NumOperands > 2) {
8405 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8406 ResVT.getVectorNumElements()/2);
8407 ArrayRef<SDUse> Ops = Op->ops();
8408 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8409 Ops.slice(0, NumOperands/2));
8410 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8411 Ops.slice(NumOperands/2));
8412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8415 assert(NumNonZero == 2 && "Simple cases not handled?");
8417 if (ResVT.getVectorNumElements() >= 16)
8418 return Op; // The operation is legal with KUNPCK
8420 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
8421 DAG.getUNDEF(ResVT), Op.getOperand(0),
8422 DAG.getIntPtrConstant(0, dl));
8423 unsigned NumElems = ResVT.getVectorNumElements();
8424 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
8425 DAG.getIntPtrConstant(NumElems/2, dl));
8428 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8429 const X86Subtarget &Subtarget,
8430 SelectionDAG &DAG) {
8431 MVT VT = Op.getSimpleValueType();
8432 if (VT.getVectorElementType() == MVT::i1)
8433 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8435 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8436 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8437 Op.getNumOperands() == 4)));
8439 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8440 // from two other 128-bit ones.
8442 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8443 return LowerAVXCONCAT_VECTORS(Op, DAG);
8446 //===----------------------------------------------------------------------===//
8447 // Vector shuffle lowering
8449 // This is an experimental code path for lowering vector shuffles on x86. It is
8450 // designed to handle arbitrary vector shuffles and blends, gracefully
8451 // degrading performance as necessary. It works hard to recognize idiomatic
8452 // shuffles and lower them to optimal instruction patterns without leaving
8453 // a framework that allows reasonably efficient handling of all vector shuffle
8454 // patterns.
8455 //===----------------------------------------------------------------------===//
8457 /// \brief Tiny helper function to identify a no-op mask.
8459 /// This is a somewhat boring predicate function. It checks whether the mask
8460 /// array input, which is assumed to be a single-input shuffle mask of the kind
8461 /// used by the X86 shuffle instructions (not a fully general
8462 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8463 /// in-place shuffle are 'no-op's.
8464 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8465 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8466 assert(Mask[i] >= -1 && "Out of bound mask element!");
8467 if (Mask[i] >= 0 && Mask[i] != i)
8473 /// \brief Test whether there are elements crossing 128-bit lanes in this
8476 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8477 /// and we routinely test for these.
8478 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8479 int LaneSize = 128 / VT.getScalarSizeInBits();
8480 int Size = Mask.size();
8481 for (int i = 0; i < Size; ++i)
8482 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8487 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8489 /// This checks a shuffle mask to see if it is performing the same
8490 /// lane-relative shuffle in each sub-lane. This trivially implies
8491 /// that it is also not lane-crossing. It may however involve a blend from the
8492 /// same lane of a second vector.
8494 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8495 /// non-trivial to compute in the face of undef lanes. The representation is
8496 /// suitable for use with existing 128-bit shuffles as entries from the second
8497 /// vector have been remapped to [LaneSize, 2*LaneSize).
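/// For example (illustrative), for v8f32 the mask <0, 9, 2, 11, 4, 13, 6, 15>
/// repeats per 128-bit lane and yields RepeatedMask = <0, 5, 2, 7>.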
8498 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8500 SmallVectorImpl<int> &RepeatedMask) {
8501 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8502 RepeatedMask.assign(LaneSize, -1);
8503 int Size = Mask.size();
8504 for (int i = 0; i < Size; ++i) {
8505 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8508 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8509 // This entry crosses lanes, so there is no way to model this shuffle.
8512 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8513 // Adjust second vector indices to start at LaneSize instead of Size.
8514 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8515 : Mask[i] % LaneSize + LaneSize;
8516 if (RepeatedMask[i % LaneSize] < 0)
8517 // This is the first non-undef entry in this slot of a 128-bit lane.
8518 RepeatedMask[i % LaneSize] = LocalM;
8519 else if (RepeatedMask[i % LaneSize] != LocalM)
8520 // Found a mismatch with the repeated mask.
8526 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8528 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8529 SmallVectorImpl<int> &RepeatedMask) {
8530 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8533 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8535 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8536 SmallVectorImpl<int> &RepeatedMask) {
8537 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8540 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8541 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8542 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8544 SmallVectorImpl<int> &RepeatedMask) {
8545 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8546 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8547 int Size = Mask.size();
8548 for (int i = 0; i < Size; ++i) {
8549 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8550 if (Mask[i] == SM_SentinelUndef)
8552 if (Mask[i] == SM_SentinelZero) {
8553 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8555 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8558 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8559 // This entry crosses lanes, so there is no way to model this shuffle.
8562 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8563 // Adjust second vector indices to start at LaneSize instead of Size.
8565 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8566 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8567 // This is the first non-undef entry in this slot of a 128-bit lane.
8568 RepeatedMask[i % LaneSize] = LocalM;
8569 else if (RepeatedMask[i % LaneSize] != LocalM)
8570 // Found a mismatch with the repeated mask.
8576 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8577 /// arguments.
8579 /// This is a fast way to test a shuffle mask against a fixed pattern:
8581 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
8583 /// It returns true if the mask is exactly as wide as the argument list, and
8584 /// each element of the mask is either -1 (signifying undef) or the value given
8585 /// in the argument.
8586 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8587 ArrayRef<int> ExpectedMask) {
8588 if (Mask.size() != ExpectedMask.size())
8591 int Size = Mask.size();
8593 // If the values are build vectors, we can look through them to find
8594 // equivalent inputs that make the shuffles equivalent.
8595 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8596 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8598 for (int i = 0; i < Size; ++i) {
8599 assert(Mask[i] >= -1 && "Out of bound mask element!");
8600 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8601 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8602 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8603 if (!MaskBV || !ExpectedBV ||
8604 MaskBV->getOperand(Mask[i] % Size) !=
8605 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8613 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8615 /// The masks must be exactly the same width.
8617 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8618 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8620 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
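/// For example (illustrative), Mask = <SM_SentinelUndef, 1, SM_SentinelZero, 3>
/// is equivalent to ExpectedMask = <0, 1, SM_SentinelZero, 3>, but not to
/// <0, 1, 2, 3>, because the zero sentinel must match exactly.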
8621 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8622 ArrayRef<int> ExpectedMask) {
8623 int Size = Mask.size();
8624 if (Size != (int)ExpectedMask.size())
8627 for (int i = 0; i < Size; ++i)
8628 if (Mask[i] == SM_SentinelUndef)
8630 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8632 else if (Mask[i] != ExpectedMask[i])
8638 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
8639 // mask.
8640 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8641 const APInt &Zeroable) {
8642 int NumElts = Mask.size();
8643 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8645 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8646 for (int i = 0; i != NumElts; ++i) {
8648 if (M == SM_SentinelUndef)
8650 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8651 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8656 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
8657 // instructions.
8658 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8659 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8662 SmallVector<int, 8> Unpcklwd;
8663 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8664 /* Unary = */ false);
8665 SmallVector<int, 8> Unpckhwd;
8666 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8667 /* Unary = */ false);
8668 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8669 isTargetShuffleEquivalent(Mask, Unpckhwd));
8670 return IsUnpackwdMask;
8673 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8675 /// This helper function produces an 8-bit shuffle immediate corresponding to
8676 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8677 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8680 /// NB: We rely heavily on "undef" masks preserving the input lane.
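/// For example (illustrative), Mask = <2, 1, -1, 0> encodes as
///   Imm = (2 << 0) | (1 << 2) | (2 << 4) | (0 << 6) = 0x26,
/// where the undef element in slot 2 defaults to its own lane index.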
8681 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8682 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8683 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8684 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8685 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8686 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8689 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8690 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8691 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8692 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8696 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8697 SelectionDAG &DAG) {
8698 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8701 /// \brief Compute whether each element of a shuffle is zeroable.
8703 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8704 /// Either it is an undef element in the shuffle mask, the element of the input
8705 /// referenced is undef, or the element of the input referenced is known to be
8706 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8707 /// as many lanes with this technique as possible to simplify the remaining
8708 /// shuffle.
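/// For example (illustrative), with Mask = <0, -1, 4, 7> and V2 an all-zeros
/// build_vector, elements 1, 2 and 3 are zeroable: element 1 is undef in the
/// mask and elements 2 and 3 read from the known-zero input.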
8709 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8710 SDValue V1, SDValue V2) {
8711 APInt Zeroable(Mask.size(), 0);
8712 V1 = peekThroughBitcasts(V1);
8713 V2 = peekThroughBitcasts(V2);
8715 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8716 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8718 int VectorSizeInBits = V1.getValueSizeInBits();
8719 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8720 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8722 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8724 // Handle the easy cases.
8725 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8730 // Determine shuffle input and normalize the mask.
8731 SDValue V = M < Size ? V1 : V2;
8734 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8735 if (V.getOpcode() != ISD::BUILD_VECTOR)
8738 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
8739 // portion of the (larger) source element must be UNDEF/ZERO.
8740 if ((Size % V.getNumOperands()) == 0) {
8741 int Scale = Size / V->getNumOperands();
8742 SDValue Op = V.getOperand(M / Scale);
8743 if (Op.isUndef() || X86::isZeroNode(Op))
8745 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8746 APInt Val = Cst->getAPIntValue();
8747 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8748 Val = Val.getLoBits(ScalarSizeInBits);
8751 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8752 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8753 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8754 Val = Val.getLoBits(ScalarSizeInBits);
8761 // If the BUILD_VECTOR has more elements than the mask, then all the (smaller)
8762 // source elements must be UNDEF or ZERO.
8763 if ((V.getNumOperands() % Size) == 0) {
8764 int Scale = V->getNumOperands() / Size;
8765 bool AllZeroable = true;
8766 for (int j = 0; j < Scale; ++j) {
8767 SDValue Op = V.getOperand((M * Scale) + j);
8768 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8779 // The shuffle result is as follows:
8780 // 0* a[0] 0* a[1] ... 0* a[n], n >= 0, where the a[] elements appear in
8781 // ascending order. Each element of Zeroable corresponds to a particular
8782 // element of Mask, as described in the computeZeroableShuffleElements function.
8784 // The function looks for a sub-mask whose non-zero elements are in
8785 // increasing order. If such a sub-mask exists, the function returns true.
8786 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8787 ArrayRef<int> Mask, const EVT &VectorType,
8788 bool &IsZeroSideLeft) {
8789 int NextElement = -1;
8790 // Check if the Mask's nonzero elements are in increasing order.
8791 for (int i = 0, e = Mask.size(); i < e; i++) {
8792 // Checks that the mask's zero elements are built from only zeros.
8793 assert(Mask[i] >= -1 && "Out of bound mask element!");
8798 // Find the lowest non zero element
8799 if (NextElement < 0) {
8800 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8801 IsZeroSideLeft = NextElement != 0;
8803 // Exit if the mask's non zero elements are not in increasing order.
8804 if (NextElement != Mask[i])
8811 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
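/// For example (illustrative), the single-input v8i16 shuffle
/// <4, 5, 6, 7, 0, 1, 2, 3> becomes a PSHUFB with byte mask
/// <8,9, 10,11, 12,13, 14,15, 0,1, 2,3, 4,5, 6,7>.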
8812 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8813 ArrayRef<int> Mask, SDValue V1,
8815 const APInt &Zeroable,
8816 const X86Subtarget &Subtarget,
8817 SelectionDAG &DAG) {
8818 int Size = Mask.size();
8819 int LaneSize = 128 / VT.getScalarSizeInBits();
8820 const int NumBytes = VT.getSizeInBits() / 8;
8821 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8823 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8824 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8825 (Subtarget.hasBWI() && VT.is512BitVector()));
8827 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8828 // Sign bit set in i8 mask means zero element.
8829 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8832 for (int i = 0; i < NumBytes; ++i) {
8833 int M = Mask[i / NumEltBytes];
8835 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8838 if (Zeroable[i / NumEltBytes]) {
8839 PSHUFBMask[i] = ZeroMask;
8843 // We can only use a single input of V1 or V2.
8844 SDValue SrcV = (M >= Size ? V2 : V1);
8850 // PSHUFB can't cross lanes, ensure this doesn't happen.
8851 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8855 M = M * NumEltBytes + (i % NumEltBytes);
8856 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8858 assert(V && "Failed to find a source input");
8860 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8861 return DAG.getBitcast(
8862 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8863 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8866 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8867 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8870 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
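// For example (illustrative), a v4i32 shuffle producing <V1[0], 0, V1[1], 0>
// can be lowered as a VEXPAND of V1 under the execution mask 0b0101 (the
// complement of the zeroable lanes), selecting zero in the masked-off lanes.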
8871 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8872 const APInt &Zeroable,
8873 ArrayRef<int> Mask, SDValue &V1,
8874 SDValue &V2, SelectionDAG &DAG,
8875 const X86Subtarget &Subtarget) {
8876 bool IsLeftZeroSide = true;
8877 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8880 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8882 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8883 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8884 unsigned NumElts = VT.getVectorNumElements();
8885 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8886 "Unexpected number of vector elements");
8887 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8888 Subtarget, DAG, DL);
8889 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8890 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8891 return DAG.getSelect(DL, VT, VMask,
8892 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8896 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8897 unsigned &UnpackOpcode, bool IsUnary,
8898 ArrayRef<int> TargetMask, SDLoc &DL,
8900 const X86Subtarget &Subtarget) {
8901 int NumElts = VT.getVectorNumElements();
8903 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8904 for (int i = 0; i != NumElts; i += 2) {
8905 int M1 = TargetMask[i + 0];
8906 int M2 = TargetMask[i + 1];
8907 Undef1 &= (SM_SentinelUndef == M1);
8908 Undef2 &= (SM_SentinelUndef == M2);
8909 Zero1 &= isUndefOrZero(M1);
8910 Zero2 &= isUndefOrZero(M2);
8912 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8913 "Zeroable shuffle detected");
8915 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8916 SmallVector<int, 64> Unpckl, Unpckh;
8917 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8918 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8919 UnpackOpcode = X86ISD::UNPCKL;
8920 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8921 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8925 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8926 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8927 UnpackOpcode = X86ISD::UNPCKH;
8928 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8929 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8933 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
8934 if (IsUnary && (Zero1 || Zero2)) {
8935 // Don't bother if we can blend instead.
8936 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8937 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8940 bool MatchLo = true, MatchHi = true;
8941 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8942 int M = TargetMask[i];
8944 // Ignore if the input is known to be zero or the index is undef.
8945 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8946 (M == SM_SentinelUndef))
8949 MatchLo &= (M == Unpckl[i]);
8950 MatchHi &= (M == Unpckh[i]);
8953 if (MatchLo || MatchHi) {
8954 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8955 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8956 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8961 // If a binary shuffle, commute and try again.
8963 ShuffleVectorSDNode::commuteMask(Unpckl);
8964 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8965 UnpackOpcode = X86ISD::UNPCKL;
8970 ShuffleVectorSDNode::commuteMask(Unpckh);
8971 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8972 UnpackOpcode = X86ISD::UNPCKH;
8981 // X86 has dedicated unpack instructions that can handle specific blend
8982 // operations: UNPCKH and UNPCKL.
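// For example (illustrative), for v4i32 the UNPCKL pattern is <0, 4, 1, 5> and
// the UNPCKH pattern is <2, 6, 3, 7>.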
8983 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8984 ArrayRef<int> Mask, SDValue V1,
8985 SDValue V2, SelectionDAG &DAG) {
8986 SmallVector<int, 8> Unpckl;
8987 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8988 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8989 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8991 SmallVector<int, 8> Unpckh;
8992 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8993 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8994 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8996 // Commute and try again.
8997 ShuffleVectorSDNode::commuteMask(Unpckl);
8998 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8999 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9001 ShuffleVectorSDNode::commuteMask(Unpckh);
9002 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9003 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
9008 // X86 has dedicated pack instructions that can handle specific truncation
9009 // operations: PACKSS and PACKUS.
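// For example (illustrative), the v16i8 mask <0, 2, 4, ..., 30> taking the
// even-indexed bytes of the concatenation of V1 and V2 matches PACKUS when the
// upper byte of every i16 element is known zero, and PACKSS when every i16
// element has at least 9 sign bits.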
9010 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
9011 SDValue &V2, unsigned &PackOpcode,
9012 ArrayRef<int> TargetMask,
9014 const X86Subtarget &Subtarget) {
9015 unsigned NumElts = VT.getVectorNumElements();
9016 unsigned BitSize = VT.getScalarSizeInBits();
9017 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
9018 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
9020 auto MatchPACK = [&](SDValue N1, SDValue N2) {
9021 SDValue VV1 = DAG.getBitcast(PackVT, N1);
9022 SDValue VV2 = DAG.getBitcast(PackVT, N2);
9023 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
9024 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
9028 PackOpcode = X86ISD::PACKSS;
9032 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
9033 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
9034 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
9035 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
9039 PackOpcode = X86ISD::PACKUS;
9047 // Try binary shuffle.
9048 SmallVector<int, 32> BinaryMask;
9049 createPackShuffleMask(VT, BinaryMask, false);
9050 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
9051 if (MatchPACK(V1, V2))
9054 // Try unary shuffle.
9055 SmallVector<int, 32> UnaryMask;
9056 createPackShuffleMask(VT, UnaryMask, true);
9057 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
9058 if (MatchPACK(V1, V1))
9064 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
9065 ArrayRef<int> Mask, SDValue V1,
9066 SDValue V2, SelectionDAG &DAG,
9067 const X86Subtarget &Subtarget) {
9069 unsigned PackOpcode;
9070 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
9072 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
9073 DAG.getBitcast(PackVT, V2));
9078 /// \brief Try to emit a bitmask instruction for a shuffle.
9080 /// This handles cases where we can model a blend exactly as a bitmask due to
9081 /// one of the inputs being zeroable.
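/// For example (illustrative), the v4i32 mask <0, Z, 2, Z> (Z = zeroable)
/// lowers to (and V1, <-1, 0, -1, 0>).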
9082 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9083 SDValue V2, ArrayRef<int> Mask,
9084 const APInt &Zeroable,
9085 SelectionDAG &DAG) {
9086 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9087 MVT EltVT = VT.getVectorElementType();
9088 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9089 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9090 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
9092 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9095 if (Mask[i] % Size != i)
9096 return SDValue(); // Not a blend.
9098 V = Mask[i] < Size ? V1 : V2;
9099 else if (V != (Mask[i] < Size ? V1 : V2))
9100 return SDValue(); // Can only let one input through the mask.
9102 VMaskOps[i] = AllOnes;
9105 return SDValue(); // No non-zeroable elements!
9107 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
9108 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
9111 /// \brief Try to emit a blend instruction for a shuffle using bit math.
9113 /// This is used as a fallback approach when first class blend instructions are
9114 /// unavailable. Currently it is only suitable for integer vectors, but could
9115 /// be generalized for floating point vectors if desirable.
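/// For example (illustrative), the v8i16 blend mask <0, 9, 2, 11, 4, 13, 6, 15>
/// becomes (or (and V1, M), (andnp M, V2)) with M = <-1, 0, -1, 0, -1, 0, -1, 0>.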
9116 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9117 SDValue V2, ArrayRef<int> Mask,
9118 SelectionDAG &DAG) {
9119 assert(VT.isInteger() && "Only supports integer vector types!");
9120 MVT EltVT = VT.getVectorElementType();
9121 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9122 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9123 SmallVector<SDValue, 16> MaskOps;
9124 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9125 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9126 return SDValue(); // Shuffled input!
9127 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
9130 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9131 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9132 // We have to cast V2 around.
9133 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9134 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9135 DAG.getBitcast(MaskVT, V1Mask),
9136 DAG.getBitcast(MaskVT, V2)));
9137 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
9140 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9141 SDValue PreservedSrc,
9142 const X86Subtarget &Subtarget,
9145 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9146 MutableArrayRef<int> TargetMask,
9147 bool &ForceV1Zero, bool &ForceV2Zero,
9148 uint64_t &BlendMask) {
9149 bool V1IsZeroOrUndef =
9150 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9151 bool V2IsZeroOrUndef =
9152 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9155 ForceV1Zero = false, ForceV2Zero = false;
9156 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9158 // Attempt to generate the binary blend mask. If an input is zero then
9159 // we can use any lane.
9160 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9161 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9162 int M = TargetMask[i];
9163 if (M == SM_SentinelUndef)
9167 if (M == i + Size) {
9168 BlendMask |= 1ull << i;
9171 if (M == SM_SentinelZero) {
9172 if (V1IsZeroOrUndef) {
9177 if (V2IsZeroOrUndef) {
9179 BlendMask |= 1ull << i;
9180 TargetMask[i] = i + Size;
9189 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
9191 uint64_t ScaledMask = 0;
9192 for (int i = 0; i != Size; ++i)
9193 if (BlendMask & (1ull << i))
9194 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
9198 /// \brief Try to emit a blend instruction for a shuffle.
9200 /// This doesn't do any checks for the availability of instructions for blending
9201 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9202 /// be matched in the backend with the type given. What it does check for is
9203 /// that the shuffle mask is a blend, or convertible into a blend with zero.
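/// For example (illustrative), the v4i32 mask <0, 5, 2, 7> takes elements 0 and
/// 2 from V1 and elements 1 and 3 from V2, giving a BLENDI immediate of 0b1010.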
9204 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9205 SDValue V2, ArrayRef<int> Original,
9206 const APInt &Zeroable,
9207 const X86Subtarget &Subtarget,
9208 SelectionDAG &DAG) {
9209 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9211 uint64_t BlendMask = 0;
9212 bool ForceV1Zero = false, ForceV2Zero = false;
9213 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
9217 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
9219 V1 = getZeroVector(VT, Subtarget, DAG, DL);
9221 V2 = getZeroVector(VT, Subtarget, DAG, DL);
9223 switch (VT.SimpleTy) {
9228 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9229 DAG.getConstant(BlendMask, DL, MVT::i8));
9233 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9237 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9238 // that instruction.
9239 if (Subtarget.hasAVX2()) {
9240 // Scale the blend by the number of 32-bit dwords per element.
9241 int Scale = VT.getScalarSizeInBits() / 32;
9242 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9243 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9244 V1 = DAG.getBitcast(BlendVT, V1);
9245 V2 = DAG.getBitcast(BlendVT, V2);
9246 return DAG.getBitcast(
9247 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9248 DAG.getConstant(BlendMask, DL, MVT::i8)));
9252 // For integer shuffles we need to expand the mask and cast the inputs to
9253 // v8i16s prior to blending.
9254 int Scale = 8 / VT.getVectorNumElements();
9255 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9256 V1 = DAG.getBitcast(MVT::v8i16, V1);
9257 V2 = DAG.getBitcast(MVT::v8i16, V2);
9258 return DAG.getBitcast(VT,
9259 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9260 DAG.getConstant(BlendMask, DL, MVT::i8)));
9264 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9265 SmallVector<int, 8> RepeatedMask;
9266 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9267 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9268 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
9270 for (int i = 0; i < 8; ++i)
9271 if (RepeatedMask[i] >= 8)
9272 BlendMask |= 1ull << i;
9273 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9274 DAG.getConstant(BlendMask, DL, MVT::i8));
9280 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9281 "256-bit byte-blends require AVX2 support!");
9283 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9285 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9286 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9287 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9290 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9291 if (SDValue Masked =
9292 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9295 // Scale the blend by the number of bytes per element.
9296 int Scale = VT.getScalarSizeInBits() / 8;
9298 // This form of blend is always done on bytes. Compute the byte vector
9300 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9302 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9303 // mix of LLVM's code generator and the x86 backend. We tell the code
9304 // generator that boolean values in the elements of an x86 vector register
9305 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9306 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9307 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9308 // of the element (the remaining are ignored) and 0 in that high bit would
9309 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9310 // the LLVM model for boolean values in vector elements gets the relevant
9311 // bit set, it is set backwards and over constrained relative to x86's
9313 SmallVector<SDValue, 32> VSELECTMask;
9314 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9315 for (int j = 0; j < Scale; ++j)
9316 VSELECTMask.push_back(
9317 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9318 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9321 V1 = DAG.getBitcast(BlendVT, V1);
9322 V2 = DAG.getBitcast(BlendVT, V2);
9323 return DAG.getBitcast(
9325 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
9335 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9336 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9337 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9340 llvm_unreachable("Not a supported integer vector type!");
9344 /// \brief Try to lower as a blend of elements from two inputs followed by
9345 /// a single-input permutation.
9347 /// This matches the pattern where we can blend elements from two inputs and
9348 /// then reduce the shuffle to a single-input permutation.
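/// For example (illustrative), the v4i32 mask <2, 5, 0, 7> is handled as a
/// blend with mask <0, 5, 2, 7> followed by a single-input permute <2, 1, 0, 3>.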
9349 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9350 SDValue V1, SDValue V2,
9352 SelectionDAG &DAG) {
9353 // We build up the blend mask while checking whether a blend is a viable way
9354 // to reduce the shuffle.
9355 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9356 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9358 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9362 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9364 if (BlendMask[Mask[i] % Size] < 0)
9365 BlendMask[Mask[i] % Size] = Mask[i];
9366 else if (BlendMask[Mask[i] % Size] != Mask[i])
9367 return SDValue(); // Can't blend in the needed input!
9369 PermuteMask[i] = Mask[i] % Size;
9372 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9373 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9376 /// \brief Generic routine to decompose a shuffle and blend into independent
9377 /// blends and permutes.
9379 /// This matches the extremely common pattern for handling combined
9380 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9381 /// operations. It will try to pick the best arrangement of shuffles and
9382 /// blends.
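/// For example (illustrative), the v4i32 mask <2, 5, 0, 7> is decomposed into
/// shuffling V1 by <2, -1, 0, -1>, shuffling V2 by <-1, 1, -1, 3>, and blending
/// the two results with <0, 5, 2, 7>.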
9383 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9387 SelectionDAG &DAG) {
9388 // Shuffle the input elements into the desired positions in V1 and V2 and
9389 // blend them together.
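// For example, a v4i32 mask <1, 4, 3, 6> decomposes into V1Mask <1, -1, 3, -1>,
// V2Mask <-1, 0, -1, 2>, and BlendMask <0, 5, 2, 7>.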
9390 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9391 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9392 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9393 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9394 if (Mask[i] >= 0 && Mask[i] < Size) {
9395 V1Mask[i] = Mask[i];
9397 } else if (Mask[i] >= Size) {
9398 V2Mask[i] = Mask[i] - Size;
9399 BlendMask[i] = i + Size;
9402 // Try to lower with the simpler initial blend strategy unless one of the
9403 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9404 // shuffle may be able to fold with a load or gain some other benefit. However, when
9405 // we'll have to do 2x as many shuffles in order to achieve this, blending
9406 // first is a better strategy.
9407 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9408 if (SDValue BlendPerm =
9409 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9412 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9413 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9414 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9417 /// \brief Try to lower a vector shuffle as a rotation.
9419 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9420 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9421 ArrayRef<int> Mask) {
9422 int NumElts = Mask.size();
9424 // We need to detect various ways of spelling a rotation:
9425 // [11, 12, 13, 14, 15, 0, 1, 2]
9426 // [-1, 12, 13, 14, -1, -1, 1, -1]
9427 // [-1, -1, -1, -1, -1, -1, 1, 2]
9428 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9429 // [-1, 4, 5, 6, -1, -1, 9, -1]
9430 // [-1, 4, 5, 6, -1, -1, -1, -1]
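// For the first mask above, element 0 wants element 11 of the concatenation,
// so StartIdx = 0 - (11 % 8) = -3 and the candidate rotation is 3; every other
// defined element must agree on that same rotation.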
9433 for (int i = 0; i < NumElts; ++i) {
9435 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9436 "Unexpected mask index.");
9440 // Determine where a rotated vector would have started.
9441 int StartIdx = i - (M % NumElts);
9443 // The identity rotation isn't interesting, stop.
9446 // If we found the tail of a vector the rotation must be the missing
9447 // front. If we found the head of a vector, it must be how much of the head is present.
9449 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9452 Rotation = CandidateRotation;
9453 else if (Rotation != CandidateRotation)
9454 // The rotations don't match, so we can't match this mask.
9457 // Compute which value this mask is pointing at.
9458 SDValue MaskV = M < NumElts ? V1 : V2;
9460 // Compute which of the two target values this index should be assigned
9461 // to. This reflects whether the high elements are remaining or the low
9462 // elements are remaining.
9463 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9465 // Either set up this value if we've not encountered it before, or check
9466 // that it remains consistent.
9469 else if (TargetV != MaskV)
9470 // This may be a rotation, but it pulls from the inputs in some
9471 // unsupported interleaving.
9475 // Check that we successfully analyzed the mask, and normalize the results.
9476 assert(Rotation != 0 && "Failed to locate a viable rotation!");
9477 assert((Lo || Hi) && "Failed to find a rotated input vector!");
9489 /// \brief Try to lower a vector shuffle as a byte rotation.
9491 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9492 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9493 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9494 /// try to generically lower a vector shuffle through such a pattern. It
9495 /// does not check for the profitability of lowering either as PALIGNR or
9496 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9497 /// This matches shuffle vectors that look like:
9499 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9501 /// Essentially it concatenates V1 and V2, shifts right by some number of
9502 /// elements, and takes the low elements as the result. Note that while this is
9503 /// specified as a *right shift* because x86 is little-endian, it is a *left
9504 /// rotate* of the vector lanes.
9505 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9506 ArrayRef<int> Mask) {
9507 // Don't accept any shuffles with zero elements.
9508 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9511 // PALIGNR works on 128-bit lanes.
9512 SmallVector<int, 16> RepeatedMask;
9513 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
9516 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9520 // PALIGNR rotates bytes, so we need to scale the
9521 // rotation based on how many bytes are in the vector lane.
9522 int NumElts = RepeatedMask.size();
9523 int Scale = 16 / NumElts;
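// For example, a v8i16 lane has 8 elements, so Scale is 2 and an element
// rotation of 3 becomes a PALIGNR immediate of 6 bytes.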
9524 return Rotation * Scale;
9527 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9528 SDValue V1, SDValue V2,
9530 const X86Subtarget &Subtarget,
9531 SelectionDAG &DAG) {
9532 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9534 SDValue Lo = V1, Hi = V2;
9535 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
9536 if (ByteRotation <= 0)
9539 // Cast the inputs to i8 vector of correct length to match PALIGNR or
9541 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9542 Lo = DAG.getBitcast(ByteVT, Lo);
9543 Hi = DAG.getBitcast(ByteVT, Hi);
9545 // SSSE3 targets can use the palignr instruction.
9546 if (Subtarget.hasSSSE3()) {
9547 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9548 "512-bit PALIGNR requires BWI instructions");
9549 return DAG.getBitcast(
9550 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9551 DAG.getConstant(ByteRotation, DL, MVT::i8)));
9554 assert(VT.is128BitVector() &&
9555 "Rotate-based lowering only supports 128-bit lowering!");
9556 assert(Mask.size() <= 16 &&
9557 "Can shuffle at most 16 bytes in a 128-bit vector!");
9558 assert(ByteVT == MVT::v16i8 &&
9559 "SSE2 rotate lowering only needed for v16i8!");
9561 // Default SSE2 implementation
9562 int LoByteShift = 16 - ByteRotation;
9563 int HiByteShift = ByteRotation;
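// For example, a byte rotation of 5 shifts Lo left by 11 bytes and Hi right by
// 5 bytes; OR'ing the two produces the same bytes a PALIGNR by 5 would.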
9565 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9566 DAG.getConstant(LoByteShift, DL, MVT::i8));
9567 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9568 DAG.getConstant(HiByteShift, DL, MVT::i8));
9569 return DAG.getBitcast(VT,
9570 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9573 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9575 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
9576 /// rotation of the concatenation of two vectors; this routine will
9577 /// try to generically lower a vector shuffle through such a pattern.
9579 /// Essentially it concatenates V1 and V2, shifts right by some number of
9580 /// elements, and takes the low elements as the result. Note that while this is
9581 /// specified as a *right shift* because x86 is little-endian, it is a *left
9582 /// rotate* of the vector lanes.
9583 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9584 SDValue V1, SDValue V2,
9586 const X86Subtarget &Subtarget,
9587 SelectionDAG &DAG) {
9588 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9589 "Only 32-bit and 64-bit elements are supported!");
9591 // 128/256-bit vectors are only supported with VLX.
9592 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9593 && "VLX required for 128/256-bit vectors");
9595 SDValue Lo = V1, Hi = V2;
9596 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9600 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9601 DAG.getConstant(Rotation, DL, MVT::i8));
9604 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9606 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9607 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9608 /// matches elements from one of the input vectors shuffled to the left or
9609 /// right with zeroable elements 'shifted in'. It handles both the strictly
9610 /// bit-wise element shifts and the byte shift across an entire 128-bit double quad word lane.
9613 /// PSHL : (little-endian) left bit shift.
9614 /// [ zz, 0, zz, 2 ]
9615 /// [ -1, 4, zz, -1 ]
9616 /// PSRL : (little-endian) right bit shift.
9618 /// [ -1, -1, 7, zz]
9619 /// PSLLDQ : (little-endian) left byte shift
9620 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9621 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9622 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9623 /// PSRLDQ : (little-endian) right byte shift
9624 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9625 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9626 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
9627 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9628 unsigned ScalarSizeInBits,
9629 ArrayRef<int> Mask, int MaskOffset,
9630 const APInt &Zeroable,
9631 const X86Subtarget &Subtarget) {
9632 int Size = Mask.size();
9633 unsigned SizeInBits = Size * ScalarSizeInBits;
9635 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9636 for (int i = 0; i < Size; i += Scale)
9637 for (int j = 0; j < Shift; ++j)
9638 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
9644 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9645 for (int i = 0; i != Size; i += Scale) {
9646 unsigned Pos = Left ? i + Shift : i;
9647 unsigned Low = Left ? i : i + Shift;
9648 unsigned Len = Scale - Shift;
9649 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
9653 int ShiftEltBits = ScalarSizeInBits * Scale;
9654 bool ByteShift = ShiftEltBits > 64;
9655 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9656 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9657 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
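// For example, a v4i32 mask [zz, 0, 1, 2] matches with Scale == 4 and
// Shift == 1: ShiftEltBits is 128, so this is a byte shift and ShiftAmt is
// 1 * 32 / 8 == 4 (a VSHLDQ by 4 bytes).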
9659 // Normalize the scale for byte shifts to still produce an i64 element type.
9661 Scale = ByteShift ? Scale / 2 : Scale;
9663 // We need to round trip through the appropriate type for the shift.
9664 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9665 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9666 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9667 return (int)ShiftAmt;
9670 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9671 // keep doubling the size of the integer elements up to that. We can
9672 // then shift the elements of the integer vector by whole multiples of
9673 // their width within the elements of the larger integer vector. Test each
9674 // multiple to see if we can find a match with the moved element indices
9675 // and that the shifted in elements are all zeroable.
9676 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9677 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9678 for (int Shift = 1; Shift != Scale; ++Shift)
9679 for (bool Left : {true, false})
9680 if (CheckZeros(Shift, Scale, Left)) {
9681 int ShiftAmt = MatchShift(Shift, Scale, Left);
9690 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9691 SDValue V2, ArrayRef<int> Mask,
9692 const APInt &Zeroable,
9693 const X86Subtarget &Subtarget,
9694 SelectionDAG &DAG) {
9695 int Size = Mask.size();
9696 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9702 // Try to match shuffle against V1 shift.
9703 int ShiftAmt = matchVectorShuffleAsShift(
9704 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9706 // If V1 failed, try to match shuffle against V2 shift.
9709 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9710 Mask, Size, Zeroable, Subtarget);
9717 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9718 "Illegal integer vector type");
9719 V = DAG.getBitcast(ShiftVT, V);
9720 V = DAG.getNode(Opcode, DL, ShiftVT, V,
9721 DAG.getConstant(ShiftAmt, DL, MVT::i8));
9722 return DAG.getBitcast(VT, V);
9725 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9726 // Remainder of lower half result is zero and upper half is all undef.
9727 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
9728 ArrayRef<int> Mask, uint64_t &BitLen,
9729 uint64_t &BitIdx, const APInt &Zeroable) {
9730 int Size = Mask.size();
9731 int HalfSize = Size / 2;
9732 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9733 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9735 // Upper half must be undefined.
9736 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9739 // Determine the extraction length from the part of the
9740 // lower half that isn't zeroable.
9742 for (; Len > 0; --Len)
9743 if (!Zeroable[Len - 1])
9745 assert(Len > 0 && "Zeroable shuffle mask");
9747 // Attempt to match first Len sequential elements from the lower half.
9750 for (int i = 0; i != Len; ++i) {
9752 if (M == SM_SentinelUndef)
9754 SDValue &V = (M < Size ? V1 : V2);
9757 // The extracted elements must start at a valid index and all mask
9758 // elements must be in the lower half.
9759 if (i > M || M >= HalfSize)
9762 if (Idx < 0 || (Src == V && Idx == (M - i))) {
9770 if (!Src || Idx < 0)
9773 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9774 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9775 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
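// For example, matching Len == 2 v8i16 elements starting at Idx == 1 encodes
// BitLen == 32 and BitIdx == 16.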
9780 // INSERTQ: Extract lowest Len elements from lower half of second source and
9781 // insert over first source, starting at Idx.
9782 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9783 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
9784 ArrayRef<int> Mask, uint64_t &BitLen,
9786 int Size = Mask.size();
9787 int HalfSize = Size / 2;
9788 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9790 // Upper half must be undefined.
9791 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9794 for (int Idx = 0; Idx != HalfSize; ++Idx) {
9797 // Attempt to match first source from mask before insertion point.
9798 if (isUndefInRange(Mask, 0, Idx)) {
9800 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9802 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9808 // Extend the extraction length looking to match both the insertion of
9809 // the second source and the remaining elements of the first.
9810 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9815 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9817 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9823 // Match the remaining elements of the lower half.
9824 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9826 } else if ((!Base || (Base == V1)) &&
9827 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9829 } else if ((!Base || (Base == V2)) &&
9830 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9837 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9838 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9848 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9849 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9850 SDValue V2, ArrayRef<int> Mask,
9851 const APInt &Zeroable,
9852 SelectionDAG &DAG) {
9853 uint64_t BitLen, BitIdx;
9854 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
9855 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
9856 DAG.getConstant(BitLen, DL, MVT::i8),
9857 DAG.getConstant(BitIdx, DL, MVT::i8));
9859 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
9860 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
9861 V2 ? V2 : DAG.getUNDEF(VT),
9862 DAG.getConstant(BitLen, DL, MVT::i8),
9863 DAG.getConstant(BitIdx, DL, MVT::i8));
9868 /// \brief Lower a vector shuffle as a zero or any extension.
9870 /// Given a specific number of elements, element bit width, and extension
9871 /// stride, produce either a zero or any extension based on the available
9872 /// features of the subtarget. The extended elements are consecutive and can
9873 /// begin at an offset element index in the input; to avoid excess shuffling
9874 /// the offset must either be in the bottom lane or at the start of a higher
9875 /// lane. All extended elements must come from the same input lane.
9877 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9878 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9879 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9880 assert(Scale > 1 && "Need a scale to extend.");
9881 int EltBits = VT.getScalarSizeInBits();
9882 int NumElements = VT.getVectorNumElements();
9883 int NumEltsPerLane = 128 / EltBits;
9884 int OffsetLane = Offset / NumEltsPerLane;
9885 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9886 "Only 8, 16, and 32 bit elements can be extended.");
9887 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9888 assert(0 <= Offset && "Extension offset must be positive.");
9889 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9890 "Extension offset must be in the first lane or start an upper lane.");
9892 // Check that an index is in same lane as the base offset.
9893 auto SafeOffset = [&](int Idx) {
9894 return OffsetLane == (Idx / NumEltsPerLane);
9897 // Shift along an input so that the offset base moves to the first element.
9898 auto ShuffleOffset = [&](SDValue V) {
9902 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9903 for (int i = 0; i * Scale < NumElements; ++i) {
9904 int SrcIdx = i + Offset;
9905 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9907 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9910 // Found a valid zext mask! Try various lowering strategies based on the
9911 // input type and available ISA extensions.
9912 if (Subtarget.hasSSE41()) {
9913 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9914 // PUNPCK will catch this in a later shuffle match.
9915 if (Offset && Scale == 2 && VT.is128BitVector())
9917 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9918 NumElements / Scale);
9919 InputV = ShuffleOffset(InputV);
9920 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9921 return DAG.getBitcast(VT, InputV);
9924 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9926 // For any extends we can cheat for larger element sizes and use shuffle
9927 // instructions that can fold with a load and/or copy.
9928 if (AnyExt && EltBits == 32) {
9929 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9931 return DAG.getBitcast(
9932 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9933 DAG.getBitcast(MVT::v4i32, InputV),
9934 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9936 if (AnyExt && EltBits == 16 && Scale > 2) {
9937 int PSHUFDMask[4] = {Offset / 2, -1,
9938 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9939 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9940 DAG.getBitcast(MVT::v4i32, InputV),
9941 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9942 int PSHUFWMask[4] = {1, -1, -1, -1};
9943 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9944 return DAG.getBitcast(
9945 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9946 DAG.getBitcast(MVT::v8i16, InputV),
9947 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9950 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes to 64-bit integers.
9952 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9953 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9954 assert(VT.is128BitVector() && "Unexpected vector width!");
9956 int LoIdx = Offset * EltBits;
9957 SDValue Lo = DAG.getBitcast(
9958 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9959 DAG.getConstant(EltBits, DL, MVT::i8),
9960 DAG.getConstant(LoIdx, DL, MVT::i8)));
9962 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9963 !SafeOffset(Offset + 1))
9964 return DAG.getBitcast(VT, Lo);
9966 int HiIdx = (Offset + 1) * EltBits;
9967 SDValue Hi = DAG.getBitcast(
9968 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9969 DAG.getConstant(EltBits, DL, MVT::i8),
9970 DAG.getConstant(HiIdx, DL, MVT::i8)));
9971 return DAG.getBitcast(VT,
9972 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9975 // If this would require more than 2 unpack instructions to expand, use
9976 // pshufb when available. We can only use more than 2 unpack instructions
9977 // when zero extending i8 elements which also makes it easier to use pshufb.
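// For example, with Scale == 8 and Offset 0 the PSHUFB mask selects byte 0
// into lane 0 and byte 1 into lane 8, and fills every other lane with 0x80 so
// PSHUFB zeroes it.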
9978 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9979 assert(NumElements == 16 && "Unexpected byte vector width!");
9980 SDValue PSHUFBMask[16];
9981 for (int i = 0; i < 16; ++i) {
9982 int Idx = Offset + (i / Scale);
9983 PSHUFBMask[i] = DAG.getConstant(
9984 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9986 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9987 return DAG.getBitcast(
9988 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9989 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9992 // If we are extending from an offset, ensure we start on a boundary that
9993 // we can unpack from.
9994 int AlignToUnpack = Offset % (NumElements / Scale);
9995 if (AlignToUnpack) {
9996 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9997 for (int i = AlignToUnpack; i < NumElements; ++i)
9998 ShMask[i - AlignToUnpack] = i;
9999 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
10000 Offset -= AlignToUnpack;
10003 // Otherwise emit a sequence of unpacks.
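// For example, zero extending i8 elements by a Scale of 2 is a single
// PUNPCKLBW (or PUNPCKHBW for a high offset) against a zero vector (or undef
// for an any-extend), interleaving each source byte with a zero byte.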
10005 unsigned UnpackLoHi = X86ISD::UNPCKL;
10006 if (Offset >= (NumElements / 2)) {
10007 UnpackLoHi = X86ISD::UNPCKH;
10008 Offset -= (NumElements / 2);
10011 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
10012 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
10013 : getZeroVector(InputVT, Subtarget, DAG, DL);
10014 InputV = DAG.getBitcast(InputVT, InputV);
10015 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
10019 } while (Scale > 1);
10020 return DAG.getBitcast(VT, InputV);
10023 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
10025 /// This routine will try to do everything in its power to cleverly lower
10026 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
10027 /// check for the profitability of this lowering, it tries to aggressively
10028 /// match this pattern. It will use all of the micro-architectural details it
10029 /// can to emit an efficient lowering. It handles both blends with all-zero
10030 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
10031 /// masking out later).
10033 /// The reason we have dedicated lowering for zext-style shuffles is that they
10034 /// are both incredibly common and often quite performance sensitive.
10035 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
10036 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10037 const APInt &Zeroable, const X86Subtarget &Subtarget,
10038 SelectionDAG &DAG) {
10039 int Bits = VT.getSizeInBits();
10040 int NumLanes = Bits / 128;
10041 int NumElements = VT.getVectorNumElements();
10042 int NumEltsPerLane = NumElements / NumLanes;
10043 assert(VT.getScalarSizeInBits() <= 32 &&
10044 "Exceeds 32-bit integer zero extension limit");
10045 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
10047 // Define a helper function to check a particular ext-scale and lower to it if valid.
10049 auto Lower = [&](int Scale) -> SDValue {
10051 bool AnyExt = true;
10054 for (int i = 0; i < NumElements; ++i) {
10057 continue; // Valid anywhere but doesn't tell us anything.
10058 if (i % Scale != 0) {
10059 // Each of the extended elements needs to be zeroable.
10063 // We no longer are in the anyext case.
10068 // Each of the base elements needs to be consecutive indices into the
10069 // same input vector.
10070 SDValue V = M < NumElements ? V1 : V2;
10071 M = M % NumElements;
10074 Offset = M - (i / Scale);
10075 } else if (InputV != V)
10076 return SDValue(); // Flip-flopping inputs.
10078 // Offset must start in the lowest 128-bit lane or at the start of an
10080 // FIXME: Is it ever worth allowing a negative base offset?
10081 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
10082 (Offset % NumEltsPerLane) == 0))
10085 // If we are offsetting, all referenced entries must come from the same
10087 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
10090 if ((M % NumElements) != (Offset + (i / Scale)))
10091 return SDValue(); // Non-consecutive strided elements.
10095 // If we fail to find an input, we have a zero-shuffle which should always
10096 // have already been handled.
10097 // FIXME: Maybe handle this here in case during blending we end up with one?
10101 // If we are offsetting, don't extend if we only match a single input, we
10102 // can always do better by using a basic PSHUF or PUNPCK.
10103 if (Offset != 0 && Matches < 2)
10106 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10107 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
10110 // The widest scale possible for extending is to a 64-bit integer.
10111 assert(Bits % 64 == 0 &&
10112 "The number of bits in a vector must be divisible by 64 on x86!");
10113 int NumExtElements = Bits / 64;
10115 // Each iteration, try extending the elements half as much, but into twice as many.
10117 for (; NumExtElements < NumElements; NumExtElements *= 2) {
10118 assert(NumElements % NumExtElements == 0 &&
10119 "The input vector size must be divisible by the extended size.");
10120 if (SDValue V = Lower(NumElements / NumExtElements))
10124 // General extends failed, but 128-bit vectors may be able to use MOVQ.
10128 // Returns one of the source operands if the shuffle can be reduced to a
10129 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
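// For example, a v4i32 mask <0, 1, zz, zz> (or <4, 5, zz, zz> to take the low
// half of V2) can be lowered as a single MOVQ.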
10130 auto CanZExtLowHalf = [&]() {
10131 for (int i = NumElements / 2; i != NumElements; ++i)
10134 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
10136 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
10141 if (SDValue V = CanZExtLowHalf()) {
10142 V = DAG.getBitcast(MVT::v2i64, V);
10143 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
10144 return DAG.getBitcast(VT, V);
10147 // No viable ext lowering found.
10151 /// \brief Try to get a scalar value for a specific element of a vector.
10153 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
10154 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
10155 SelectionDAG &DAG) {
10156 MVT VT = V.getSimpleValueType();
10157 MVT EltVT = VT.getVectorElementType();
10158 V = peekThroughBitcasts(V);
10160 // If the bitcasts shift the element size, we can't extract an equivalent
10161 // element from it.
10162 MVT NewVT = V.getSimpleValueType();
10163 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
10166 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10167 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
10168 // Ensure the scalar operand is the same size as the destination.
10169 // FIXME: Add support for scalar truncation where possible.
10170 SDValue S = V.getOperand(Idx);
10171 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
10172 return DAG.getBitcast(EltVT, S);
10178 /// \brief Helper to test for a load that can be folded with x86 shuffles.
10180 /// This is particularly important because the set of instructions varies
10181 /// significantly based on whether the operand is a load or not.
10182 static bool isShuffleFoldableLoad(SDValue V) {
10183 V = peekThroughBitcasts(V);
10184 return ISD::isNON_EXTLoad(V.getNode());
10187 /// \brief Try to lower insertion of a single element into a zero vector.
10189 /// This is a common pattern for which we have especially efficient lowerings
10190 /// across all subtarget feature sets.
10191 static SDValue lowerVectorShuffleAsElementInsertion(
10192 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10193 const APInt &Zeroable, const X86Subtarget &Subtarget,
10194 SelectionDAG &DAG) {
10196 MVT EltVT = VT.getVectorElementType();
10199 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
10201 bool IsV1Zeroable = true;
10202 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10203 if (i != V2Index && !Zeroable[i]) {
10204 IsV1Zeroable = false;
10208 // Check for a single input from a SCALAR_TO_VECTOR node.
10209 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
10210 // all the smarts here sunk into that routine. However, the current
10211 // lowering of BUILD_VECTOR makes that nearly impossible until the old
10212 // vector shuffle lowering is dead.
10213 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
10215 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
10216 // We need to zext the scalar if it is smaller than an i32.
10217 V2S = DAG.getBitcast(EltVT, V2S);
10218 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
10219 // Using zext to expand a narrow element won't work for non-zero elements.
10224 // Zero-extend directly to i32.
10225 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
10226 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
10228 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
10229 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
10230 EltVT == MVT::i16) {
10231 // Either not inserting from the low element of the input or the input
10232 // element size is too small to use VZEXT_MOVL to clear the high bits.
10236 if (!IsV1Zeroable) {
10237 // If V1 can't be treated as a zero vector we have fewer options to lower
10238 // this. We can't support integer vectors or non-zero targets cheaply, and
10239 // the V1 elements can't be permuted in any way.
10240 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
10241 if (!VT.isFloatingPoint() || V2Index != 0)
10243 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
10244 V1Mask[V2Index] = -1;
10245 if (!isNoopShuffleMask(V1Mask))
10247 if (!VT.is128BitVector())
10250 // Otherwise, use MOVSD or MOVSS.
10251 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
10252 "Only two types of floating point element types to handle!");
10253 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
10257 // This lowering only works for the low element with floating point vectors.
10258 if (VT.isFloatingPoint() && V2Index != 0)
10261 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
10263 V2 = DAG.getBitcast(VT, V2);
10265 if (V2Index != 0) {
10266 // If we have 4 or fewer lanes we can cheaply shuffle the element into
10267 // the desired position. Otherwise it is more efficient to do a vector
10268 // shift left. We know that we can do a vector shift left because all
10269 // the inputs are zero.
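// For example, inserting an i16 into element 5 of an otherwise-zero v8i16
// shifts the bitcast v16i8 value left by 5 * 16 / 8 == 10 bytes.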
10270 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
10271 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
10272 V2Shuffle[V2Index] = 0;
10273 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
10275 V2 = DAG.getBitcast(MVT::v16i8, V2);
10277 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
10278 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
10279 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
10280 DAG.getDataLayout(), VT)));
10281 V2 = DAG.getBitcast(VT, V2);
10287 /// Try to lower a broadcast of a single (truncated) integer element,
10288 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
10290 /// This assumes we have AVX2.
10291 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
10292 SDValue V0, int BroadcastIdx,
10293 const X86Subtarget &Subtarget,
10294 SelectionDAG &DAG) {
10295 assert(Subtarget.hasAVX2() &&
10296 "We can only lower integer broadcasts with AVX2!");
10298 EVT EltVT = VT.getVectorElementType();
10299 EVT V0VT = V0.getValueType();
10301 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
10302 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
10304 EVT V0EltVT = V0VT.getVectorElementType();
10305 if (!V0EltVT.isInteger())
10308 const unsigned EltSize = EltVT.getSizeInBits();
10309 const unsigned V0EltSize = V0EltVT.getSizeInBits();
10311 // This is only a truncation if the original element type is larger.
10312 if (V0EltSize <= EltSize)
10315 assert(((V0EltSize % EltSize) == 0) &&
10316 "Scalar type sizes must all be powers of 2 on x86!");
10318 const unsigned V0Opc = V0.getOpcode();
10319 const unsigned Scale = V0EltSize / EltSize;
10320 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
10322 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
10323 V0Opc != ISD::BUILD_VECTOR)
10326 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
10328 // If we're extracting non-least-significant bits, shift so we can truncate.
10329 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
10330 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
10331 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
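// For example, broadcasting i8 element 3 out of a v4i32 source uses scalar
// operand 0 shifted right by 3 * 8 == 24 bits before truncating.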
10332 if (const int OffsetIdx = BroadcastIdx % Scale)
10333 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
10334 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
10336 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
10337 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
10340 /// \brief Try to lower broadcast of a single element.
10342 /// For convenience, this code also bundles all of the subtarget feature set
10343 /// filtering. While a little annoying to re-dispatch on type here, there isn't
10344 /// a convenient way to factor it out.
10345 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
10346 SDValue V1, SDValue V2,
10347 ArrayRef<int> Mask,
10348 const X86Subtarget &Subtarget,
10349 SelectionDAG &DAG) {
10350 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10351 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10352 (Subtarget.hasAVX2() && VT.isInteger())))
10355 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10356 // we can only broadcast from a register with AVX2.
10357 unsigned NumElts = Mask.size();
10358 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
10360 : X86ISD::VBROADCAST;
10361 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10363 // Check that the mask is a broadcast.
10364 int BroadcastIdx = -1;
10365 for (int i = 0; i != (int)NumElts; ++i) {
10366 SmallVector<int, 8> BroadcastMask(NumElts, i);
10367 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10373 if (BroadcastIdx < 0)
10375 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10376 "a sorted mask where the broadcast "
10379 // Go up the chain of (vector) values to find a scalar load that we can
10380 // combine with the broadcast.
10383 switch (V.getOpcode()) {
10384 case ISD::BITCAST: {
10385 // Peek through bitcasts as long as BroadcastIdx can be adjusted.
10386 SDValue VSrc = V.getOperand(0);
10387 unsigned NumEltBits = V.getScalarValueSizeInBits();
10388 unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
10389 if ((NumEltBits % NumSrcBits) == 0)
10390 BroadcastIdx *= (NumEltBits / NumSrcBits);
10391 else if ((NumSrcBits % NumEltBits) == 0 &&
10392 (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
10393 BroadcastIdx /= (NumSrcBits / NumEltBits);
10399 case ISD::CONCAT_VECTORS: {
10400 int OperandSize = Mask.size() / V.getNumOperands();
10401 V = V.getOperand(BroadcastIdx / OperandSize);
10402 BroadcastIdx %= OperandSize;
10405 case ISD::INSERT_SUBVECTOR: {
10406 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10407 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10411 int BeginIdx = (int)ConstantIdx->getZExtValue();
10413 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10414 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10415 BroadcastIdx -= BeginIdx;
10426 // Ensure the source vector and BroadcastIdx are for a suitable type.
10427 if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
10428 unsigned NumEltBits = VT.getScalarSizeInBits();
10429 unsigned NumSrcBits = V.getScalarValueSizeInBits();
10430 if ((NumSrcBits % NumEltBits) == 0)
10431 BroadcastIdx *= (NumSrcBits / NumEltBits);
10432 else if ((NumEltBits % NumSrcBits) == 0 &&
10433 (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
10434 BroadcastIdx /= (NumEltBits / NumSrcBits);
10438 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
10439 MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
10440 V = DAG.getBitcast(SrcVT, V);
10443 // Check if this is a broadcast of a scalar. We special case lowering
10444 // for scalars so that we can more effectively fold with loads.
10445 // First, look through bitcast: if the original value has a larger element
10446 // type than the shuffle, the broadcast element is in essence truncated.
10447 // Make that explicit to ease folding.
10448 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10449 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10450 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10451 return TruncBroadcast;
10453 MVT BroadcastVT = VT;
10455 // Peek through any bitcast (only useful for loads).
10456 SDValue BC = peekThroughBitcasts(V);
10458 // Also check the simpler case, where we can directly reuse the scalar.
10459 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10460 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10461 V = V.getOperand(BroadcastIdx);
10463 // If we can't broadcast from a register, check that the input is a load.
10464 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10466 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10467 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10468 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10469 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10470 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
10475 // If we are broadcasting a load that is only used by the shuffle
10476 // then we can reduce the vector load to the broadcasted scalar load.
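// For example, broadcasting element 3 of a loaded v4f32 becomes a broadcast of
// a scalar f32 load from BaseAddr plus an offset of 12 bytes.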
10477 LoadSDNode *Ld = cast<LoadSDNode>(BC);
10478 SDValue BaseAddr = Ld->getOperand(1);
10479 EVT SVT = BroadcastVT.getScalarType();
10480 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10481 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10482 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10483 DAG.getMachineFunction().getMachineMemOperand(
10484 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10485 DAG.makeEquivalentMemoryOrdering(Ld, V);
10486 } else if (!BroadcastFromReg) {
10487 // We can't broadcast from a vector register.
10489 } else if (BroadcastIdx != 0) {
10490 // We can only broadcast from the zero-element of a vector register,
10491 // but it can be advantageous to broadcast from the zero-element of a subvector.
10493 if (!VT.is256BitVector() && !VT.is512BitVector())
10496 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10497 if (VT == MVT::v4f64 || VT == MVT::v4i64)
10500 // Only broadcast the zero-element of a 128-bit subvector.
10501 unsigned EltSize = VT.getScalarSizeInBits();
10502 if (((BroadcastIdx * EltSize) % 128) != 0)
10505 // The shuffle input might have been a bitcast we looked through; look at
10506 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
10507 // later bitcast it to BroadcastVT.
10508 assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10509 "Unexpected vector element size");
10510 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
10511 "Unexpected vector size");
10512 V = extract128BitVector(V, BroadcastIdx, DAG, DL);
10515 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10516 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10517 DAG.getBitcast(MVT::f64, V));
10519 // Bitcast back to the same scalar type as BroadcastVT.
10520 MVT SrcVT = V.getSimpleValueType();
10521 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10522 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10523 "Unexpected vector element size");
10524 if (SrcVT.isVector()) {
10525 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10526 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10528 SrcVT = BroadcastVT.getScalarType();
10530 V = DAG.getBitcast(SrcVT, V);
10533 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10534 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10535 V = DAG.getBitcast(MVT::f64, V);
10536 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10537 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10540 // We only support broadcasting from 128-bit vectors to minimize the
10541 // number of patterns we need to deal with in isel. So extract down to
10542 // 128-bits, removing as many bitcasts as possible.
10543 if (SrcVT.getSizeInBits() > 128) {
10544 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
10545 128 / SrcVT.getScalarSizeInBits());
10546 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
10547 V = DAG.getBitcast(ExtVT, V);
10550 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10553 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10554 // INSERTPS when the V1 elements are already in the correct locations
10555 // because otherwise we can just always use two SHUFPS instructions which
10556 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10557 // perform INSERTPS if a single V1 element is out of place and all V2
10558 // elements are zeroable.
10559 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10560 unsigned &InsertPSMask,
10561 const APInt &Zeroable,
10562 ArrayRef<int> Mask,
10563 SelectionDAG &DAG) {
10564 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10565 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10566 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10568 // Attempt to match INSERTPS with one element from VA or VB being
10569 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
10571 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10572 ArrayRef<int> CandidateMask) {
10573 unsigned ZMask = 0;
10574 int VADstIndex = -1;
10575 int VBDstIndex = -1;
10576 bool VAUsedInPlace = false;
10578 for (int i = 0; i < 4; ++i) {
10579 // Synthesize a zero mask from the zeroable elements (includes undefs).
10585 // Flag if we use any VA inputs in place.
10586 if (i == CandidateMask[i]) {
10587 VAUsedInPlace = true;
10591 // We can only insert a single non-zeroable element.
10592 if (VADstIndex >= 0 || VBDstIndex >= 0)
10595 if (CandidateMask[i] < 4) {
10596 // VA input out of place for insertion.
10599 // VB input for insertion.
10604 // Don't bother if we have no (non-zeroable) element for insertion.
10605 if (VADstIndex < 0 && VBDstIndex < 0)
10608 // Determine element insertion src/dst indices. The src index is from the
10609 // start of the inserted vector, not the start of the concatenated vector.
10610 unsigned VBSrcIndex = 0;
10611 if (VADstIndex >= 0) {
10612 // If we have a VA input out of place, we use VA as the V2 element
10613 // insertion and don't use the original V2 at all.
10614 VBSrcIndex = CandidateMask[VADstIndex];
10615 VBDstIndex = VADstIndex;
10618 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10621 // If no V1 inputs are used in place, then the result is created only from
10622 // the zero mask and the V2 insertion - so remove V1 dependency.
10623 if (!VAUsedInPlace)
10624 VA = DAG.getUNDEF(MVT::v4f32);
10626 // Update V1, V2 and InsertPSMask accordingly.
10630 // Insert the V2 element into the desired position.
10631 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
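// For example, inserting from source element 2 into destination element 1
// with element 3 zeroed encodes (2 << 6) | (1 << 4) | 0x8 == 0x98.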
10632 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10636 if (matchAsInsertPS(V1, V2, Mask))
10639 // Commute and try again.
10640 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10641 ShuffleVectorSDNode::commuteMask(CommutedMask);
10642 if (matchAsInsertPS(V2, V1, CommutedMask))
10648 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10649 SDValue V2, ArrayRef<int> Mask,
10650 const APInt &Zeroable,
10651 SelectionDAG &DAG) {
10652 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10653 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10655 // Attempt to match the insertps pattern.
10656 unsigned InsertPSMask;
10657 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10660 // Insert the V2 element into the desired position.
10661 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10662 DAG.getConstant(InsertPSMask, DL, MVT::i8));
10665 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10666 /// UNPCK instruction.
10668 /// This specifically targets cases where we end up with alternating between
10669 /// the two inputs, and so can permute them into something that feeds a single
10670 /// UNPCK instruction. Note that this routine only targets integer vectors
10671 /// because for floating point vectors we have a generalized SHUFPS lowering
10672 /// strategy that handles everything that doesn't *exactly* match an unpack,
10673 /// making this clever lowering unnecessary.
10674 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10675 SDValue V1, SDValue V2,
10676 ArrayRef<int> Mask,
10677 SelectionDAG &DAG) {
10678 assert(!VT.isFloatingPoint() &&
10679 "This routine only supports integer vectors.");
10680 assert(VT.is128BitVector() &&
10681 "This routine only works on 128-bit vectors.");
10682 assert(!V2.isUndef() &&
10683 "This routine should only be used when blending two inputs.");
10684 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10686 int Size = Mask.size();
10689 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10691 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10693 bool UnpackLo = NumLoInputs >= NumHiInputs;
10695 auto TryUnpack = [&](int ScalarSize, int Scale) {
10696 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10697 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10699 for (int i = 0; i < Size; ++i) {
10703 // Each element of the unpack contains Scale elements from this mask.
10704 int UnpackIdx = i / Scale;
10706 // We only handle the case where V1 feeds the first slots of the unpack.
10707 // We rely on canonicalization to ensure this is the case.
10708 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10711 // Setup the mask for this input. The indexing is tricky as we have to
10712 // handle the unpack stride.
10713 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10714 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10718 // If we will have to shuffle both inputs to use the unpack, check whether
10719 // we can just unpack first and shuffle the result. If so, skip this unpack.
10720 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10721 !isNoopShuffleMask(V2Mask))
10724 // Shuffle the inputs into place.
10725 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10726 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10728 // Cast the inputs to the type we will use to unpack them.
10729 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10730 V1 = DAG.getBitcast(UnpackVT, V1);
10731 V2 = DAG.getBitcast(UnpackVT, V2);
10733 // Unpack the inputs and cast the result back to the desired type.
10734 return DAG.getBitcast(
10735 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10736 UnpackVT, V1, V2));
10739 // We try each unpack from the largest to the smallest to try and find one
10740 // that fits this mask.
10741 int OrigScalarSize = VT.getScalarSizeInBits();
10742 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10743 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10746 // If none of the unpack-rooted lowerings worked (or were profitable) try an initial unpack followed by a permute.
10748 if (NumLoInputs == 0 || NumHiInputs == 0) {
10749 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10750 "We have to have *some* inputs!");
10751 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10753 // FIXME: We could consider the total complexity of the permute of each
10754 // possible unpacking. Or at the least we should consider how many
10755 // half-crossings are created.
10756 // FIXME: We could consider commuting the unpacks.
10758 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10759 for (int i = 0; i < Size; ++i) {
10763 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10766 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10768 return DAG.getVectorShuffle(
10769 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10771 DAG.getUNDEF(VT), PermMask);
10777 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10779 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10780 /// support for floating point shuffles but not integer shuffles. These
10781 /// instructions will incur a domain crossing penalty on some chips, so
10782 /// it is better to avoid lowering through this for integer vectors where possible.
10784 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10785 const APInt &Zeroable,
10786 SDValue V1, SDValue V2,
10787 const X86Subtarget &Subtarget,
10788 SelectionDAG &DAG) {
10789 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10790 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10791 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10793 if (V2.isUndef()) {
10794 // Check for being able to broadcast a single element.
10795 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10796 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10799 // Straight shuffle of a single input vector. Simulate this by using the
10800 // single input as both of the "inputs" to this instruction.
10801 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
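// For example, the splat mask <1, 1> gives a SHUFPD immediate of 0b11.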
10803 if (Subtarget.hasAVX()) {
10804 // If we have AVX, we can use VPERMILPS which will allow folding a load
10805 // into the shuffle.
10806 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10807 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10810 return DAG.getNode(
10811 X86ISD::SHUFP, DL, MVT::v2f64,
10812 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10813 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10814 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10816 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10817 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10819 // If we have a single input, insert that into V1 if we can do so cheaply.
10820 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10821 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10822 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10824 // Try inverting the insertion since for v2 masks it is easy to do and we
10825 // can't reliably sort the mask one way or the other.
10826 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10827 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10828 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10829 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10833 // Try to use one of the special instruction patterns to handle two common
10834 // blend patterns if a zero-blend above didn't work.
10835 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10836 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10837 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10838 // We can either use a special instruction to load over the low double or
10839 // to move just the low double.
10840 return DAG.getNode(
10841 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10842 DL, MVT::v2f64, V2,
10843 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10845 if (Subtarget.hasSSE41())
10846 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10847 Zeroable, Subtarget, DAG))
10850 // Use dedicated unpack instructions for masks that match their pattern.
10852 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10855 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10856 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10857 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10860 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10862 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10863 /// the integer unit to minimize domain crossing penalties. However, for blends
10864 /// it falls back to the floating point shuffle operation with appropriate bit casting.
10866 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10867 const APInt &Zeroable,
10868 SDValue V1, SDValue V2,
10869 const X86Subtarget &Subtarget,
10870 SelectionDAG &DAG) {
10871 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10872 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10873 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10875 if (V2.isUndef()) {
10876 // Check for being able to broadcast a single element.
10877 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10878 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10881 // Straight shuffle of a single input vector. For everything from SSE2
10882 // onward this has a single fast instruction with no scary immediates.
10883 // We have to map the mask as it is actually a v4i32 shuffle instruction.
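// For example, a v2i64 mask <1, 0> widens to the v4i32 PSHUFD mask <2, 3, 0, 1>.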
10884 V1 = DAG.getBitcast(MVT::v4i32, V1);
10885 int WidenedMask[4] = {
10886 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10887 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10888 return DAG.getBitcast(
10890 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10891 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10893 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10894 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10895 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10896 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10898 // Try to use shift instructions.
10899 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10900 Zeroable, Subtarget, DAG))
10903 // When loading a scalar and then shuffling it into a vector we can often do
10904 // the insertion cheaply.
10905 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10906 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10908 // Try inverting the insertion since for v2 masks it is easy to do and we
10909 // can't reliably sort the mask one way or the other.
10910 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10911 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10912 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10915 // We have different paths for blend lowering, but they all must use the
10916 // *exact* same predicate.
10917 bool IsBlendSupported = Subtarget.hasSSE41();
10918 if (IsBlendSupported)
10919 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10920 Zeroable, Subtarget, DAG))
10923 // Use dedicated unpack instructions for masks that match their pattern.
10925 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10928 // Try to use byte rotation instructions.
10929 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10930 if (Subtarget.hasSSSE3()) {
10931 if (Subtarget.hasVLX())
10932 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
10933 Mask, Subtarget, DAG))
10936 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10937 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10941 // If we have direct support for blends, we should lower by decomposing into
10942 // a permute. That will be faster than the domain cross.
10943 if (IsBlendSupported)
10944 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10947 // We implement this with SHUFPD which is pretty lame because it will likely
10948 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10949 // However, all the alternatives are still more cycles and newer chips don't
10950 // have this problem. It would be really nice if x86 had better shuffles here.
10951 V1 = DAG.getBitcast(MVT::v2f64, V1);
10952 V2 = DAG.getBitcast(MVT::v2f64, V2);
10953 return DAG.getBitcast(MVT::v2i64,
10954 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
10957 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10959 /// This is used to disable more specialized lowerings when the shufps lowering
10960 /// will happen to be efficient.
10961 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10962 // This routine only handles 128-bit shufps.
10963 assert(Mask.size() == 4 && "Unsupported mask size!");
10964 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10965 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10966 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10967 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10969 // To lower with a single SHUFPS we need to have the low half and high half
10970 // each requiring a single input.
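// For example, <0, 1, 6, 7> satisfies this (V1 feeds the low half, V2 the
// high half), while <0, 5, 2, 7> does not because each half mixes inputs.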
10971 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
return false;
10973 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
return false;
return true;
}
10979 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10981 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10982 /// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
10984 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10985 ArrayRef<int> Mask, SDValue V1,
10986 SDValue V2, SelectionDAG &DAG) {
10987 SDValue LowV = V1, HighV = V2;
10988 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10990 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
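// Mask entries 0..3 name lanes of V1 and entries 4..7 name lanes of V2, so
// this counts how many result lanes are taken from V2.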
10992 if (NumV2Elements == 1) {
10993 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10995 // Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
10997 int V2AdjIndex = V2Index ^ 1;
10999 if (Mask[V2AdjIndex] < 0) {
11000 // Handles all the cases where we have a single V2 element and an undef.
11001 // This will only ever happen in the high lanes because we commute the
11002 // vector otherwise.
11004 std::swap(LowV, HighV);
11005 NewMask[V2Index] -= 4;
} else {
11007 // Handle the case where the V2 element ends up adjacent to a V1 element.
11008 // To make this work, blend them together as the first step.
11009 int V1Index = V2AdjIndex;
11010 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
11011 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11012 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11014 // Now proceed to reconstruct the final blend as we have the necessary
11015 // high or low half formed.
if (V2Index < 2) {
LowV = V2;
HighV = V1;
} else {
HighV = V2;
}
11022 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
11023 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
}
11025 } else if (NumV2Elements == 2) {
11026 if (Mask[0] < 4 && Mask[1] < 4) {
11027 // Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
NewMask[2] -= 4;
NewMask[3] -= 4;
11031 } else if (Mask[2] < 4 && Mask[3] < 4) {
11032 // We also handle the reversed case because this utility may get called
11033 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
11034 // arrange things in the right direction.
NewMask[0] -= 4;
NewMask[1] -= 4;
HighV = V1;
LowV = V2;
} else {
11040 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
11041 // trying to place elements directly, just blend them and set up the final
11042 // shuffle to place them.
11044 // The first two blend mask elements are for V1, the second two are for
// V2.
11046 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
11047 Mask[2] < 4 ? Mask[2] : Mask[3],
11048 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
11049 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
11050 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11051 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11053 // Now we do a normal shuffle of V1 by giving V1 as both operands to
// the shuffle.
LowV = HighV = V1;
11056 NewMask[0] = Mask[0] < 4 ? 0 : 2;
11057 NewMask[1] = Mask[0] < 4 ? 2 : 0;
11058 NewMask[2] = Mask[2] < 4 ? 1 : 3;
11059 NewMask[3] = Mask[2] < 4 ? 3 : 1;
}
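// Worked example: for Mask = <0, 4, 1, 5> the mixed path blends V1 and V2
// with BlendMask <0, 1, 0, 1>, producing [V1[0], V1[1], V2[0], V2[1]], and
// the final SHUFP with NewMask <0, 2, 1, 3> yields <0, 4, 1, 5> as required.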
11062 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
11063 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
11066 /// \brief Lower 4-lane 32-bit floating point shuffles.
11068 /// Uses instructions exclusively from the floating point unit to minimize
11069 /// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
11071 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11072 const APInt &Zeroable,
11073 SDValue V1, SDValue V2,
11074 const X86Subtarget &Subtarget,
11075 SelectionDAG &DAG) {
11076 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11077 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11078 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11080 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11082 if (NumV2Elements == 0) {
11083 // Check for being able to broadcast a single element.
11084 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11085 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
11088 // Use even/odd duplicate instructions for masks that match their pattern.
11089 if (Subtarget.hasSSE3()) {
11090 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11091 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
11092 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
11093 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
11096 if (Subtarget.hasAVX()) {
11097 // If we have AVX, we can use VPERMILPS which will allow folding a load
11098 // into the shuffle.
11099 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11100 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
11103 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11104 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11105 if (!Subtarget.hasSSE2()) {
11106 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11107 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11108 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11109 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
}
11112 // Otherwise, use a straight shuffle of a single input vector. We pass the
11113 // input vector to both operands to simulate this with a SHUFPS.
11114 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11115 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
11118 // There are special ways we can lower some single-element blends. However, we
11119 // have custom ways we can lower more complex single-element blends below that
11120 // we defer to if both this and BLENDPS fail to match, so restrict this to
11121 // when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
11123 if (NumV2Elements == 1 && Mask[0] >= 4)
11124 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11125 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
11128 if (Subtarget.hasSSE41()) {
11129 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11130 Zeroable, Subtarget, DAG))
return Blend;
11133 // Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V =
11135 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
11138 if (!isSingleSHUFPSMask(Mask))
11139 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11140 DL, MVT::v4f32, V1, V2, Mask, DAG))
return BlendPerm;
}
11144 // Use low/high mov instructions. These are only valid in SSE1 because
11145 // otherwise they are widened to v2f64 and never get here.
11146 if (!Subtarget.hasSSE2()) {
11147 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11148 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11149 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11150 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
}
11153 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
11155 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
return V;
11158 // Otherwise fall back to a SHUFPS lowering strategy.
11159 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
11162 /// \brief Lower 4-lane i32 vector shuffles.
11164 /// We try to handle these with integer-domain shuffles where we can, but for
11165 /// blends we use the floating point domain blend instructions.
11166 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11167 const APInt &Zeroable,
11168 SDValue V1, SDValue V2,
11169 const X86Subtarget &Subtarget,
11170 SelectionDAG &DAG) {
11171 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11172 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11173 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11175 // Whenever we can lower this as a zext, that instruction is strictly faster
11176 // than any alternative. It also allows us to fold memory operands into the
11177 // shuffle in many cases.
11178 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11179 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
11182 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11184 if (NumV2Elements == 0) {
11185 // Check for being able to broadcast a single element.
11186 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11187 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
11190 // Straight shuffle of a single input vector. For everything from SSE2
11191 // onward this has a single fast instruction with no scary immediates.
11192 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11193 // but we aren't actually going to use the UNPCK instruction because doing
11194 // so prevents folding a load into this instruction or making a copy.
11195 const int UnpackLoMask[] = {0, 0, 1, 1};
11196 const int UnpackHiMask[] = {2, 2, 3, 3};
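// These duplicate patterns, <0, 0, 1, 1> and <2, 2, 3, 3>, encode as the
// PSHUFD immediates 0x50 and 0xFA respectively.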
11197 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11198 Mask = UnpackLoMask;
11199 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11200 Mask = UnpackHiMask;
11202 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11203 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
11206 // Try to use shift instructions.
11207 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11208 Zeroable, Subtarget, DAG))
return Shift;
11211 // There are special ways we can lower some single-element blends.
11212 if (NumV2Elements == 1)
11213 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11214 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
11217 // We have different paths for blend lowering, but they all must use the
11218 // *exact* same predicate.
11219 bool IsBlendSupported = Subtarget.hasSSE41();
11220 if (IsBlendSupported)
11221 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11222 Zeroable, Subtarget, DAG))
return Blend;
11225 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, DAG))
return Masked;
11229 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
11231 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
return V;
11234 // Try to use byte rotation instructions.
11235 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11236 if (Subtarget.hasSSSE3()) {
11237 if (Subtarget.hasVLX())
11238 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11239 Mask, Subtarget, DAG))
return Rotate;
11242 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11243 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
}
11247 // Assume that a single SHUFPS is faster than an alternative sequence of
11248 // multiple instructions (even if the CPU has a domain penalty).
11249 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11250 if (!isSingleSHUFPSMask(Mask)) {
11251 // If we have direct support for blends, we should lower by decomposing into
11252 // a permute. That will be faster than the domain cross.
11253 if (IsBlendSupported)
11254 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
Mask, DAG);
11257 // Try to lower by permuting the inputs into an unpack instruction.
11258 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11259 DL, MVT::v4i32, V1, V2, Mask, DAG))
return Unpack;
}
11263 // We implement this with SHUFPS because it can blend from two vectors.
11264 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11265 // up the inputs, bypassing domain shift penalties that we would incur if we
11266 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
11268 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11269 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11270 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11271 return DAG.getBitcast(MVT::v4i32, ShufPS);
}
11274 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11275 /// shuffle lowering, and the most complex part.
11277 /// The lowering strategy is to try to form pairs of input lanes which are
11278 /// targeted at the same half of the final vector, and then use a dword shuffle
11279 /// to place them onto the right half, and finally unpack the paired lanes into
11280 /// their final position.
11282 /// The exact breakdown of how to form these dword pairs and align them on the
11283 /// correct sides is really tricky. See the comments within the function for
11284 /// more of the details.
11286 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11287 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11288 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11289 /// vector, form the analogous 128-bit 8-element Mask.
11290 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11291 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11292 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11293 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11294 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11296 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11297 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11298 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11300 // Attempt to directly match PSHUFLW or PSHUFHW.
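// For example, <1, 0, 3, 2, 4, 5, 6, 7> is a single PSHUFLW with half mask
// <1, 0, 3, 2>, and <0, 1, 2, 3, 7, 6, 5, 4> is a single PSHUFHW.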
11301 if (isUndefOrInRange(LoMask, 0, 4) &&
11302 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
11303 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11304 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
}
11306 if (isUndefOrInRange(HiMask, 4, 8) &&
11307 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
11308 for (int i = 0; i != 4; ++i)
11309 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
11310 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11311 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
}
11314 SmallVector<int, 4> LoInputs;
11315 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11316 std::sort(LoInputs.begin(), LoInputs.end());
11317 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11318 SmallVector<int, 4> HiInputs;
11319 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11320 std::sort(HiInputs.begin(), HiInputs.end());
11321 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL =
11323 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11324 int NumHToL = LoInputs.size() - NumLToL;
int NumLToH =
11326 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11327 int NumHToH = HiInputs.size() - NumLToH;
11328 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11329 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11330 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11331 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
11333 // If we are shuffling values from one half - check how many different DWORD
11334 // pairs we need to create. If only 1 or 2 then we can perform this as a
11335 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
11336 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
11337 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
11338 V = DAG.getNode(ShufWOp, DL, VT, V,
11339 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11340 V = DAG.getBitcast(PSHUFDVT, V);
11341 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11342 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11343 return DAG.getBitcast(VT, V);
};
11346 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
11347 int PSHUFDMask[4] = { -1, -1, -1, -1 };
11348 SmallVector<std::pair<int, int>, 4> DWordPairs;
11349 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
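// DOffset is 0 when every input comes from the low half (pairs are formed
// there via PSHUFLW) and 2 when they all come from the high half (PSHUFHW).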
11351 // Collect the different DWORD pairs.
11352 for (int DWord = 0; DWord != 4; ++DWord) {
11353 int M0 = Mask[2 * DWord + 0];
11354 int M1 = Mask[2 * DWord + 1];
11355 M0 = (M0 >= 0 ? M0 % 4 : M0);
11356 M1 = (M1 >= 0 ? M1 % 4 : M1);
11357 if (M0 < 0 && M1 < 0)
continue;
11360 bool Match = false;
11361 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
11362 auto &DWordPair = DWordPairs[j];
11363 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
11364 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
11365 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
11366 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
11367 PSHUFDMask[DWord] = DOffset + j;
Match = true;
break;
}
}
if (!Match) {
11373 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
11374 DWordPairs.push_back(std::make_pair(M0, M1));
}
}
11378 if (DWordPairs.size() <= 2) {
11379 DWordPairs.resize(2, std::make_pair(-1, -1));
11380 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
11381 DWordPairs[1].first, DWordPairs[1].second};
11382 if ((NumHToL + NumHToH) == 0)
11383 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
11384 if ((NumLToL + NumLToH) == 0)
11385 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
}
}
11389 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11390 // such inputs we can swap two of the dwords across the half mark and end up
11391 // with <=2 inputs to each half in each half. Once there, we can fall through
11392 // to the generic code below. For example:
11394 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11395 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11397 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11398 // and an existing 2-into-2 on the other half. In this case we may have to
11399 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11400 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11401 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11402 // because any other situation (including a 3-into-1 or 1-into-3 in the other
11403 // half than the one we target for fixing) will be fixed when we re-enter this
11404 // path. We will also combine away any sequence of PSHUFD instructions that
11405 // result into a single instruction. Here is an example of the tricky case:
11407 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11408 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11410 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11412 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11413 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11415 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11416 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11418 // The result is fine to be handled by the generic logic.
11419 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11420 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11421 int AOffset, int BOffset) {
11422 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11423 "Must call this with A having 3 or 1 inputs from the A half.");
11424 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11425 "Must call this with B having 1 or 3 inputs from the B half.");
11426 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11427 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11429 bool ThreeAInputs = AToAInputs.size() == 3;
11431 // Compute the index of dword with only one word among the three inputs in
11432 // a half by taking the sum of the half with three inputs and subtracting
11433 // the sum of the actual three inputs. The difference is the remaining
// word.
11435 int ADWord, BDWord;
11436 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11437 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11438 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11439 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11440 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11441 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11442 int TripleNonInputIdx =
11443 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11444 TripleDWord = TripleNonInputIdx / 2;
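// For example, if the three words staying in the high half are {4, 5, 7},
// TripleInputSum is 22, the remaining word index is 22 - 16 = 6, and the
// dword holding it is 3.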
11446 // We use xor with one to compute the adjacent DWord to whichever one the
// OneInput word is in.
11448 OneInputDWord = (OneInput / 2) ^ 1;
11450 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11451 // and BToA inputs. If there is also such a problem with the BToB and AToB
11452 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11453 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11454 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11455 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11456 // Compute how many inputs will be flipped by swapping these DWords. We need
11458 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
11460 int NumFlippedAToBInputs =
11461 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11462 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11463 int NumFlippedBToBInputs =
11464 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11465 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11466 if ((NumFlippedAToBInputs == 1 &&
11467 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11468 (NumFlippedBToBInputs == 1 &&
11469 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11470 // We choose whether to fix the A half or B half based on whether that
11471 // half has zero flipped inputs. At zero, we may not be able to fix it
11472 // with that half. We also bias towards fixing the B half because that
11473 // will more commonly be the high half, and we have to bias one way.
11474 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11475 ArrayRef<int> Inputs) {
11476 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11477 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11478 // Determine whether the free index is in the flipped dword or the
11479 // unflipped dword based on where the pinned index is. We use this bit
11480 // in an xor to conditionally select the adjacent dword.
11481 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11482 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11483 if (IsFixIdxInput == IsFixFreeIdxInput)
++FixFreeIdx;
11485 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11486 assert(IsFixIdxInput != IsFixFreeIdxInput &&
11487 "We need to be changing the number of flipped inputs!");
11488 int PSHUFHalfMask[] = {0, 1, 2, 3};
11489 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
V = DAG.getNode(
11491 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11492 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11493 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11495 for (int &M : Mask)
11496 if (M >= 0 && M == FixIdx)
M = FixFreeIdx;
11498 else if (M >= 0 && M == FixFreeIdx)
M = FixIdx;
};
11501 if (NumFlippedBToBInputs != 0) {
int BPinnedIdx =
11503 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11504 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
} else {
11506 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11507 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11508 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
}
}
11513 int PSHUFDMask[] = {0, 1, 2, 3};
11514 PSHUFDMask[ADWord] = BDWord;
11515 PSHUFDMask[BDWord] = ADWord;
11516 V = DAG.getBitcast(
11518 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11519 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11521 // Adjust the mask to match the new locations of A and B.
11522 for (int &M : Mask)
11523 if (M >= 0 && M/2 == ADWord)
11524 M = 2 * BDWord + M % 2;
11525 else if (M >= 0 && M/2 == BDWord)
11526 M = 2 * ADWord + M % 2;
11528 // Recurse back into this routine to re-compute state now that this isn't
11529 // a 3 and 1 problem.
11530 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
DAG);
};
11533 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11534 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11535 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11536 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11538 // At this point there are at most two inputs to the low and high halves from
11539 // each half. That means the inputs can always be grouped into dwords and
11540 // those dwords can then be moved to the correct half with a dword shuffle.
11541 // We use at most one low and one high word shuffle to collect these paired
11542 // inputs into dwords, and finally a dword shuffle to place them.
11543 int PSHUFLMask[4] = {-1, -1, -1, -1};
11544 int PSHUFHMask[4] = {-1, -1, -1, -1};
11545 int PSHUFDMask[4] = {-1, -1, -1, -1};
11547 // First fix the masks for all the inputs that are staying in their
11548 // original halves. This will then dictate the targets of the cross-half
// shuffles.
11550 auto fixInPlaceInputs =
11551 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11552 MutableArrayRef<int> SourceHalfMask,
11553 MutableArrayRef<int> HalfMask, int HalfOffset) {
11554 if (InPlaceInputs.empty())
return;
11556 if (InPlaceInputs.size() == 1) {
11557 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11558 InPlaceInputs[0] - HalfOffset;
11559 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
return;
}
11562 if (IncomingInputs.empty()) {
11563 // Just fix all of the in place inputs.
11564 for (int Input : InPlaceInputs) {
11565 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11566 PSHUFDMask[Input / 2] = Input / 2;
}
return;
}
11571 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11572 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11573 InPlaceInputs[0] - HalfOffset;
11574 // Put the second input next to the first so that they are packed into
11575 // a dword. We find the adjacent index by toggling the low bit.
11576 int AdjIndex = InPlaceInputs[0] ^ 1;
11577 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11578 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11579 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
};
11581 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11582 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11584 // Now gather the cross-half inputs and place them into a free dword of
11585 // their target half.
11586 // FIXME: This operation could almost certainly be simplified dramatically to
11587 // look more like the 3-1 fixing operation.
11588 auto moveInputsToRightHalf = [&PSHUFDMask](
11589 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11590 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11591 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
11593 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11594 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
};
11596 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
int Word) {
11598 int LowWord = Word & ~1;
11599 int HighWord = Word | 1;
11600 return isWordClobbered(SourceHalfMask, LowWord) ||
11601 isWordClobbered(SourceHalfMask, HighWord);
};
11604 if (IncomingInputs.empty())
return;
11607 if (ExistingInputs.empty()) {
11608 // Map any dwords with inputs from them into the right half.
11609 for (int Input : IncomingInputs) {
11610 // If the source half mask maps over the inputs, turn those into
11611 // swaps and use the swapped lane.
11612 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11613 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11614 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11615 Input - SourceOffset;
11616 // We have to swap the uses in our half mask in one sweep.
11617 for (int &M : HalfMask)
11618 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
M = Input;
11620 else if (M == Input)
11621 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
} else {
11623 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11624 Input - SourceOffset &&
11625 "Previous placement doesn't match!");
11627 // Note that this correctly re-maps both when we do a swap and when
11628 // we observe the other side of the swap above. We rely on that to
11629 // avoid swapping the members of the input list directly.
11630 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11633 // Map the input's dword into the correct half.
11634 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11635 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
else
11637 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
Input / 2 &&
11639 "Previous placement doesn't match!");
}
11642 // And just directly shift any other-half mask elements to be same-half
11643 // as we will have mirrored the dword containing the element into the
11644 // same position within that half.
11645 for (int &M : HalfMask)
11646 if (M >= SourceOffset && M < SourceOffset + 4) {
11647 M = M - SourceOffset + DestOffset;
11648 assert(M >= 0 && "This should never wrap below zero!");
}
return;
}
11653 // Ensure we have the input in a viable dword of its current half. This
11654 // is particularly tricky because the original position may be clobbered
11655 // by inputs being moved and *staying* in that half.
11656 if (IncomingInputs.size() == 1) {
11657 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11658 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
SourceOffset;
11660 SourceHalfMask[InputFixed - SourceOffset] =
11661 IncomingInputs[0] - SourceOffset;
11662 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
InputFixed);
11664 IncomingInputs[0] = InputFixed;
}
11666 } else if (IncomingInputs.size() == 2) {
11667 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11668 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11669 // We have two non-adjacent or clobbered inputs we need to extract from
11670 // the source half. To do this, we need to map them into some adjacent
11671 // dword slot in the source mask.
11672 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11673 IncomingInputs[1] - SourceOffset};
11675 // If there is a free slot in the source half mask adjacent to one of
11676 // the inputs, place the other input in it. We use (Index XOR 1) to
11677 // compute an adjacent index.
11678 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11679 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11680 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11681 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11682 InputsFixed[1] = InputsFixed[0] ^ 1;
11683 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11684 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11685 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11686 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11687 InputsFixed[0] = InputsFixed[1] ^ 1;
11688 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11689 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11690 // The two inputs are in the same DWord but it is clobbered and the
11691 // adjacent DWord isn't used at all. Move both inputs to the free
// slot.
11693 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11694 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11695 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11696 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
} else {
11698 // The only way we hit this point is if there is no clobbering
11699 // (because there are no off-half inputs to this half) and there is no
11700 // free slot adjacent to one of the inputs. In this case, we have to
11701 // swap an input with a non-input.
11702 for (int i = 0; i < 4; ++i)
11703 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11704 "We can't handle any clobbers here!");
11705 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11706 "Cannot have adjacent inputs here!");
11708 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11709 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11711 // We also have to update the final source mask in this case because
11712 // it may need to undo the above swap.
11713 for (int &M : FinalSourceHalfMask)
11714 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11715 M = InputsFixed[1] + SourceOffset;
11716 else if (M == InputsFixed[1] + SourceOffset)
11717 M = (InputsFixed[0] ^ 1) + SourceOffset;
11719 InputsFixed[1] = InputsFixed[0] ^ 1;
11722 // Point everything at the fixed inputs.
11723 for (int &M : HalfMask)
11724 if (M == IncomingInputs[0])
11725 M = InputsFixed[0] + SourceOffset;
11726 else if (M == IncomingInputs[1])
11727 M = InputsFixed[1] + SourceOffset;
11729 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11730 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
}
} else {
11733 llvm_unreachable("Unhandled input size!");
}
11736 // Now hoist the DWord down to the right half.
11737 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11738 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11739 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11740 for (int &M : HalfMask)
11741 for (int Input : IncomingInputs)
if (M == Input)
11743 M = FreeDWord * 2 + Input % 2;
};
11745 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11746 /*SourceOffset*/ 4, /*DestOffset*/ 0);
11747 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11748 /*SourceOffset*/ 0, /*DestOffset*/ 4);
11750 // Now enact all the shuffles we've computed to move the inputs into their
// target halves.
11752 if (!isNoopShuffleMask(PSHUFLMask))
11753 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11754 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11755 if (!isNoopShuffleMask(PSHUFHMask))
11756 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11757 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11758 if (!isNoopShuffleMask(PSHUFDMask))
11759 V = DAG.getBitcast(
11761 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11762 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11764 // At this point, each half should contain all its inputs, and we can then
11765 // just shuffle them into their final position.
11766 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11767 "Failed to lift all the high half inputs to the low mask!");
11768 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11769 "Failed to lift all the low half inputs to the high mask!");
11771 // Do a half shuffle for the low mask.
11772 if (!isNoopShuffleMask(LoMask))
11773 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11774 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11776 // Do a half shuffle with the high mask after shifting its values down.
11777 for (int &M : HiMask)
if (M >= 0)
M -= 4;
11780 if (!isNoopShuffleMask(HiMask))
11781 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11782 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

return V;
}
11787 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11788 /// blend if only one input is used.
11789 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11790 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11791 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
bool &V2InUse) {
11793 SDValue V1Mask[16];
11794 SDValue V2Mask[16];
V1InUse = false;
V2InUse = false;
11798 int Size = Mask.size();
11799 int Scale = 16 / Size;
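// The mask is expanded to byte granularity; e.g. for a v8i16 shuffle Size is
// 8 and Scale is 2, so each mask entry covers two adjacent byte indices.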
11800 for (int i = 0; i < 16; ++i) {
11801 if (Mask[i / Scale] < 0) {
11802 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
} else {
11804 const int ZeroMask = 0x80;
11805 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
: ZeroMask;
11807 int V2Idx = Mask[i / Scale] < Size
? ZeroMask
11809 : (Mask[i / Scale] - Size) * Scale + i % Scale;
11810 if (Zeroable[i / Scale])
11811 V1Idx = V2Idx = ZeroMask;
11812 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11813 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11814 V1InUse |= (ZeroMask != V1Idx);
11815 V2InUse |= (ZeroMask != V2Idx);
}
}

if (V1InUse)
11820 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11821 DAG.getBitcast(MVT::v16i8, V1),
11822 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
if (V2InUse)
11824 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11825 DAG.getBitcast(MVT::v16i8, V2),
11826 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11828 // If we need shuffled inputs from both, blend the two.
SDValue V;
11830 if (V1InUse && V2InUse)
11831 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
else
11833 V = V1InUse ? V1 : V2;
11835 // Cast the result back to the correct type.
11836 return DAG.getBitcast(VT, V);
}
11839 /// \brief Generic lowering of 8-lane i16 shuffles.
11841 /// This handles both single-input shuffles and combined shuffle/blends with
11842 /// two inputs. The single input shuffles are immediately delegated to
11843 /// a dedicated lowering routine.
11845 /// The blends are lowered in one of three fundamental ways. If there are few
11846 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11847 /// of the input is significantly cheaper when lowered as an interleaving of
11848 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11849 /// halves of the inputs separately (making them have relatively few inputs)
11850 /// and then concatenate them.
11851 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11852 const APInt &Zeroable,
11853 SDValue V1, SDValue V2,
11854 const X86Subtarget &Subtarget,
11855 SelectionDAG &DAG) {
11856 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11857 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11858 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11860 // Whenever we can lower this as a zext, that instruction is strictly faster
11861 // than any alternative.
11862 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11863 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
11866 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
11868 if (NumV2Inputs == 0) {
11869 // Check for being able to broadcast a single element.
11870 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11871 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
11874 // Try to use shift instructions.
11875 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11876 Zeroable, Subtarget, DAG))
return Shift;
11879 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
11881 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
11884 // Use dedicated pack instructions for masks that match their pattern.
11885 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
DAG, Subtarget))
return V;
11889 // Try to use byte rotation instructions.
11890 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11891 Mask, Subtarget, DAG))
return Rotate;
11894 // Make a copy of the mask so it can be modified.
11895 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11896 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11897 MutableMask, Subtarget,
DAG);
}
11901 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11902 "All single-input shuffles should be canonicalized to be V1-input "
"shuffles.");
11905 // Try to use shift instructions.
11906 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11907 Zeroable, Subtarget, DAG))
return Shift;
11910 // See if we can use SSE4A Extraction / Insertion.
11911 if (Subtarget.hasSSE4A())
11912 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG))
return V;
11916 // There are special ways we can lower some single-element blends.
11917 if (NumV2Inputs == 1)
11918 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11919 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
11922 // We have different paths for blend lowering, but they all must use the
11923 // *exact* same predicate.
11924 bool IsBlendSupported = Subtarget.hasSSE41();
11925 if (IsBlendSupported)
11926 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11927 Zeroable, Subtarget, DAG))
return Blend;
11930 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG))
return Masked;
11934 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
11936 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
11939 // Use dedicated pack instructions for masks that match their pattern.
11940 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
Subtarget))
return V;
11944 // Try to use byte rotation instructions.
11945 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11946 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;
11949 if (SDValue BitBlend =
11950 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
11953 // Try to lower by permuting the inputs into an unpack instruction.
11954 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
V2, Mask, DAG))
return Unpack;
11958 // If we can't directly blend but can use PSHUFB, that will be better as it
11959 // can both shuffle and set up the inefficient blend.
11960 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11961 bool V1InUse, V2InUse;
11962 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11963 Zeroable, DAG, V1InUse, V2InUse);
}
11966 // We can always bit-blend if we have to so the fallback strategy is to
11967 // decompose into single-input permutes and blends.
11968 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
Mask, DAG);
}
11972 /// \brief Check whether a compaction lowering can be done by dropping even
11973 /// elements and compute how many times even elements must be dropped.
11975 /// This handles shuffles which take every Nth element where N is a power of
11976 /// two. Example shuffle masks:
11978 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11979 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11980 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11981 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11982 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11983 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11985 /// Any of these lanes can of course be undef.
11987 /// This routine only supports N <= 3.
11988 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11991 /// \returns N above, or the number of times even elements must be dropped if
11992 /// there is such a number. Otherwise returns zero.
11993 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11994 bool IsSingleInput) {
11995 // The modulus for the shuffle vector entries is based on whether this is
11996 // a single input or not.
11997 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11998 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11999 "We should only be called with masks with a power-of-2 size!");
12001 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12003 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12004 // and 2^3 simultaneously. This is because we may have ambiguity with
12005 // partially undef inputs.
12006 bool ViableForN[3] = {true, true, true};
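// ViableForN[j] tracks stride N = j + 1; e.g. for a two-input v16i8 shuffle
// the modulus is 32, so each mask element is checked against (i << N) mod 32.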
12008 for (int i = 0, e = Mask.size(); i < e; ++i) {
12009 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
// want.
if (Mask[i] < 0)
continue;
12014 bool IsAnyViable = false;
12015 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12016 if (ViableForN[j]) {
12017 uint64_t N = j + 1;
12019 // The shuffle mask must be equal to (i * 2^N) % M.
12020 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12021 IsAnyViable = true;
else
12023 ViableForN[j] = false;
}
12025 // Early exit if we exhaust the possible powers of two.
if (!IsAnyViable)
break;
}
12030 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
if (ViableForN[j])
return j + 1;
12034 // Return 0 as there is no viable power of two.
return 0;
}
12038 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12039 ArrayRef<int> Mask, SDValue V1,
12040 SDValue V2, SelectionDAG &DAG) {
12041 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12042 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12044 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
if (V2.isUndef())
12046 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12048 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
12051 /// \brief Generic lowering of v16i8 shuffles.
12053 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
12054 /// detect any complexity reducing interleaving. If that doesn't help, it uses
12055 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
12056 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
12058 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12059 const APInt &Zeroable,
12060 SDValue V1, SDValue V2,
12061 const X86Subtarget &Subtarget,
12062 SelectionDAG &DAG) {
12063 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12064 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12065 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12067 // Try to use shift instructions.
12068 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
12069 Zeroable, Subtarget, DAG))
return Shift;
12072 // Try to use byte rotation instructions.
12073 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12074 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
12077 // Use dedicated pack instructions for masks that match their pattern.
12078 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
Subtarget))
return V;
12082 // Try to use a zext lowering.
12083 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12084 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
12087 // See if we can use SSE4A Extraction / Insertion.
12088 if (Subtarget.hasSSE4A())
12089 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
return V;
12093 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
12095 // For single-input shuffles, there are some nicer lowering tricks we can use.
12096 if (NumV2Elements == 0) {
12097 // Check for being able to broadcast a single element.
12098 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12099 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
12102 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
12103 // Notably, this handles splat and partial-splat shuffles more efficiently.
12104 // However, it only makes sense if the pre-duplication shuffle simplifies
12105 // things significantly. Currently, this means we need to be able to
12106 // express the pre-duplication shuffle as an i16 shuffle.
12108 // FIXME: We should check for other patterns which can be widened into an
12109 // i16 shuffle as well.
12110 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
12111 for (int i = 0; i < 16; i += 2)
12112 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
return false;
return true;
};
12117 auto tryToWidenViaDuplication = [&]() -> SDValue {
12118 if (!canWidenViaDuplication(Mask))
return SDValue();
12120 SmallVector<int, 4> LoInputs;
12121 copy_if(Mask, std::back_inserter(LoInputs),
12122 [](int M) { return M >= 0 && M < 8; });
12123 std::sort(LoInputs.begin(), LoInputs.end());
12124 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
12126 SmallVector<int, 4> HiInputs;
12127 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
12128 std::sort(HiInputs.begin(), HiInputs.end());
12129 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
12132 bool TargetLo = LoInputs.size() >= HiInputs.size();
12133 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
12134 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
12136 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
12137 SmallDenseMap<int, int, 8> LaneMap;
12138 for (int I : InPlaceInputs) {
12139 PreDupI16Shuffle[I/2] = I/2;
LaneMap[I] = I;
}
12142 int j = TargetLo ? 0 : 4, je = j + 4;
12143 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
12144 // Check if j is already a shuffle of this input. This happens when
12145 // there are two adjacent bytes after we move the low one.
12146 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
12147 // If we haven't yet mapped the input, search for a slot into which
12149 while (j < je && PreDupI16Shuffle[j] >= 0)
++j;

if (j == je)
12153 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
return SDValue();
12156 // Map this input with the i16 shuffle.
12157 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
12160 // Update the lane map based on the mapping we ended up with.
12161 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
}
12163 V1 = DAG.getBitcast(
MVT::v16i8,
12165 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12166 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12168 // Unpack the bytes to form the i16s that will be shuffled into place.
12169 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12170 MVT::v16i8, V1, V1);
12172 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12173 for (int i = 0; i < 16; ++i)
12174 if (Mask[i] >= 0) {
12175 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12176 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12177 if (PostDupI16Shuffle[i / 2] < 0)
12178 PostDupI16Shuffle[i / 2] = MappedMask;
else
12180 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12181 "Conflicting entries in the original shuffle!");
}
12183 return DAG.getBitcast(
MVT::v16i8,
12185 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12186 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
};
12188 if (SDValue V = tryToWidenViaDuplication())
return V;
}
12192 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
return Masked;
12196 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
12198 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
12201 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12202 // with PSHUFB. It is important to do this before we attempt to generate any
12203 // blends but after all of the single-input lowerings. If the single input
12204 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12205 // want to preserve that and we can DAG combine any longer sequences into
12206 // a PSHUFB in the end. But once we start blending from multiple inputs,
12207 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12208 // and there are *very* few patterns that would actually be faster than the
12209 // PSHUFB approach because of its ability to zero lanes.
12211 // FIXME: The only exceptions to the above are blends which are exact
12212 // interleavings with direct instructions supporting them. We currently don't
12213 // handle those well here.
12214 if (Subtarget.hasSSSE3()) {
12215 bool V1InUse = false;
12216 bool V2InUse = false;
12218 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12219 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12221 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12222 // do so. This avoids using them to handle blends-with-zero which is
12223 // important as a single pshufb is significantly faster for that.
12224 if (V1InUse && V2InUse) {
12225 if (Subtarget.hasSSE41())
12226 if (SDValue Blend = lowerVectorShuffleAsBlend(
12227 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Blend;
12230 // We can use an unpack to do the blending rather than an or in some
12231 // cases. Even though the or may be (very minorly) more efficient, we
12232 // preference this lowering because there are common cases where part of
12233 // the complexity of the shuffles goes away when we do the final blend as
// an unpack.
12235 // FIXME: It might be worth trying to detect if the unpack-feeding
12236 // shuffles will both be pshufb, in which case we shouldn't bother with
// this.
12238 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12239 DL, MVT::v16i8, V1, V2, Mask, DAG))
return Unpack;
12242 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
12243 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
12244 return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
}

return PSHUFB;
}
12250 // There are special ways we can lower some single-element blends.
12251 if (NumV2Elements == 1)
12252 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12253 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
12256 if (SDValue BitBlend =
12257 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
return BitBlend;
12260 // Check whether a compaction lowering can be done. This handles shuffles
12261 // which take every Nth element for some even N. See the helper function for
// details.
12264 // We special case these as they can be particularly efficiently handled with
12265 // the PACKUSWB instruction on x86 and they show up in common patterns of
12266 // rearranging bytes to truncate wide elements.
12267 bool IsSingleInput = V2.isUndef();
12268 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12269 // NumEvenDrops is the power of two stride of the elements. Another way of
12270 // thinking about it is that we need to drop the even elements this many
12271 // times to get the original input.
12273 // First we need to zero all the dropped bytes.
12274 assert(NumEvenDrops <= 3 &&
12275 "No support for dropping even elements more than 3 times.");
12276 // We use the mask type to pick which bytes are preserved based on how many
12277 // elements are dropped.
12278 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12279 SDValue ByteClearMask = DAG.getBitcast(
12280 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
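// For example, with NumEvenDrops == 1 the 0xFF constant is splatted as v8i16
// (0x00FF per lane), so the AND below clears every odd byte before the PACKUS
// narrows the vector.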
12281 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12282 if (!IsSingleInput)
12283 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12285 // Now pack things back together.
12286 V1 = DAG.getBitcast(MVT::v8i16, V1);
12287 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12288 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12289 for (int i = 1; i < NumEvenDrops; ++i) {
12290 Result = DAG.getBitcast(MVT::v8i16, Result);
12291 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}

return Result;
}
12297 // Handle multi-input cases by blending single-input shuffles.
12298 if (NumV2Elements > 0)
12299 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
Mask, DAG);
12302 // The fallback path for single-input shuffles widens this into two v8i16
12303 // vectors with unpacks, shuffles those, and then pulls them back together
// with a pack.
SDValue V = V1;
12307 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12308 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12309 for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0)
12311 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12313 SDValue VLoHalf, VHiHalf;
12314 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12315 // them out and avoid using UNPCK{L,H} to extract the elements of V as
// i16s.
12317 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12318 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12319 // Use a mask to drop the high bytes.
12320 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12321 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12322 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12324 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12325 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12327 // Squash the masks to point directly into VLoHalf.
12328 for (int &M : LoBlendMask)
if (M >= 0)
M /= 2;
12331 for (int &M : HiBlendMask)
if (M >= 0)
M /= 2;
} else {
12335 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12336 // VHiHalf so that we can blend them as i16s.
12337 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12339 VLoHalf = DAG.getBitcast(
12340 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12341 VHiHalf = DAG.getBitcast(
12342 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
}
12345 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12346 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12348 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12351 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
12353 /// This routine breaks down the specific type of 128-bit shuffle and
12354 /// dispatches to the lowering routines accordingly.
12355 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12356 MVT VT, SDValue V1, SDValue V2,
12357 const APInt &Zeroable,
12358 const X86Subtarget &Subtarget,
12359 SelectionDAG &DAG) {
12360 switch (VT.SimpleTy) {
12362 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12364 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12366 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12368 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12370 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12372 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12375 llvm_unreachable("Unimplemented!");
12379 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
12381 /// This routine just extracts two subvectors, shuffles them independently, and
12382 /// then concatenates them back together. This should work effectively with all
12383 /// AVX vector shuffle types.
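/// For example, a v8f32 shuffle is handled by extracting the 128-bit halves of
/// each operand, lowering one v4f32 shuffle for the low half of the mask and
/// one for the high half, and rejoining the results with CONCAT_VECTORS.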
12384 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12385 SDValue V2, ArrayRef<int> Mask,
12386 SelectionDAG &DAG) {
12387 assert(VT.getSizeInBits() >= 256 &&
12388 "Only for 256-bit or wider vector shuffles!");
12389 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12390 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12392 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12393 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12395 int NumElements = VT.getVectorNumElements();
12396 int SplitNumElements = NumElements / 2;
12397 MVT ScalarVT = VT.getVectorElementType();
12398 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12400 // Rather than splitting build-vectors, just build two narrower build
12401 // vectors. This helps shuffling with splats and zeros.
12402 auto SplitVector = [&](SDValue V) {
12403 V = peekThroughBitcasts(V);
12405 MVT OrigVT = V.getSimpleValueType();
12406 int OrigNumElements = OrigVT.getVectorNumElements();
12407 int OrigSplitNumElements = OrigNumElements / 2;
12408 MVT OrigScalarVT = OrigVT.getVectorElementType();
12409 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12413 auto *BV = dyn_cast<BuildVectorSDNode>(V);
12415 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12416 DAG.getIntPtrConstant(0, DL));
12417 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12418 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
12421 SmallVector<SDValue, 16> LoOps, HiOps;
12422 for (int i = 0; i < OrigSplitNumElements; ++i) {
12423 LoOps.push_back(BV->getOperand(i));
12424 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12426 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12427 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12429 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12430 DAG.getBitcast(SplitVT, HiV));
12433 SDValue LoV1, HiV1, LoV2, HiV2;
12434 std::tie(LoV1, HiV1) = SplitVector(V1);
12435 std::tie(LoV2, HiV2) = SplitVector(V2);
12437 // Now create two 4-way blends of these half-width vectors.
12438 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12439 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12440 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12441 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12442 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12443 for (int i = 0; i < SplitNumElements; ++i) {
12444 int M = HalfMask[i];
12445 if (M >= NumElements) {
12446 if (M >= NumElements + SplitNumElements)
12450 V2BlendMask[i] = M - NumElements;
12451 BlendMask[i] = SplitNumElements + i;
12452 } else if (M >= 0) {
12453 if (M >= SplitNumElements)
12457 V1BlendMask[i] = M;
12462 // Because the lowering happens after all combining takes place, we need to
12463 // manually combine these blend masks as much as possible so that we create
12464 // a minimal number of high-level vector shuffle nodes.
12466 // First try just blending the halves of V1 or V2.
12467 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12468 return DAG.getUNDEF(SplitVT);
12469 if (!UseLoV2 && !UseHiV2)
12470 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12471 if (!UseLoV1 && !UseHiV1)
12472 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12474 SDValue V1Blend, V2Blend;
12475 if (UseLoV1 && UseHiV1) {
12477 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12479 // We only use half of V1 so map the usage down into the final blend mask.
12480 V1Blend = UseLoV1 ? LoV1 : HiV1;
12481 for (int i = 0; i < SplitNumElements; ++i)
12482 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12483 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12485 if (UseLoV2 && UseHiV2) {
12487 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12489 // We only use half of V2 so map the usage down into the final blend mask.
12490 V2Blend = UseLoV2 ? LoV2 : HiV2;
12491 for (int i = 0; i < SplitNumElements; ++i)
12492 if (BlendMask[i] >= SplitNumElements)
12493 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12495 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12497 SDValue Lo = HalfBlend(LoMask);
12498 SDValue Hi = HalfBlend(HiMask);
12499 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12502 /// \brief Either split a vector in halves or decompose the shuffles and the
12505 /// This is provided as a good fallback for many lowerings of non-single-input
12506 /// shuffles with more than one 128-bit lane. In those cases, we want to select
12507 /// between splitting the shuffle into 128-bit components and stitching those
12508 /// back together vs. extracting the single-input shuffles and blending those
12510 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12511 SDValue V1, SDValue V2,
12512 ArrayRef<int> Mask,
12513 SelectionDAG &DAG) {
12514 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
12515 "shuffles as it could then recurse on itself.");
12516 int Size = Mask.size();
12518 // If this can be modeled as a broadcast of two elements followed by a blend,
12519 // prefer that lowering. This is especially important because broadcasts can
12520 // often fold with memory operands.
12521 auto DoBothBroadcast = [&] {
12522 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
12525 if (V2BroadcastIdx < 0)
12526 V2BroadcastIdx = M - Size;
12527 else if (M - Size != V2BroadcastIdx)
12529 } else if (M >= 0) {
12530 if (V1BroadcastIdx < 0)
12531 V1BroadcastIdx = M;
12532 else if (M != V1BroadcastIdx)
12537 if (DoBothBroadcast())
12538 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
12541 // If the inputs all stem from a single 128-bit lane of each input, then we
12542 // split them rather than blending because the split will decompose to
12543 // unusually few instructions.
12544 int LaneCount = VT.getSizeInBits() / 128;
12545 int LaneSize = Size / LaneCount;
12546 SmallBitVector LaneInputs[2];
12547 LaneInputs[0].resize(LaneCount, false);
12548 LaneInputs[1].resize(LaneCount, false);
12549 for (int i = 0; i < Size; ++i)
12551 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
12552 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
12553 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12555 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12556 // that the decomposed single-input shuffles don't end up here.
12557 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12560 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12561 /// a permutation and blend of those lanes.
12563 /// This essentially blends the out-of-lane inputs to each lane into the lane
12564 /// from a permuted copy of the vector. This lowering strategy results in four
12565 /// instructions in the worst case for a single-input cross-lane shuffle, which
12566 /// is fewer than any other fully general cross-lane shuffle strategy I'm aware
12567 /// of. Special cases for each particular shuffle pattern should be handled
12568 /// prior to trying this lowering.
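/// As an illustration, the single-input v4f64 shuffle <0,2,1,3> is lowered by
/// swapping the two 128-bit halves of V1 with one permute and then doing an
/// in-lane shuffle of V1 and the flipped copy with the mask <0,4,7,3>.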
12569 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12570 SDValue V1, SDValue V2,
12571 ArrayRef<int> Mask,
12573 const X86Subtarget &Subtarget) {
12574 // FIXME: This should probably be generalized for 512-bit vectors as well.
12575 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12576 int Size = Mask.size();
12577 int LaneSize = Size / 2;
12579 // If there are only inputs from one 128-bit lane, splitting will in fact be
12580 // less expensive. The flags track whether the given lane contains an element
12581 // that crosses to another lane.
12582 if (!Subtarget.hasAVX2()) {
12583 bool LaneCrossing[2] = {false, false};
12584 for (int i = 0; i < Size; ++i)
12585 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12586 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12587 if (!LaneCrossing[0] || !LaneCrossing[1])
12588 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12590 bool LaneUsed[2] = {false, false};
12591 for (int i = 0; i < Size; ++i)
12593 LaneUsed[(Mask[i] / LaneSize)] = true;
12594 if (!LaneUsed[0] || !LaneUsed[1])
12595 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12598 assert(V2.isUndef() &&
12599 "This last part of this routine only works on single input shuffles");
12601 SmallVector<int, 32> FlippedBlendMask(Size);
12602 for (int i = 0; i < Size; ++i)
12603 FlippedBlendMask[i] =
12604 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12606 : Mask[i] % LaneSize +
12607 (i / LaneSize) * LaneSize + Size);
12609 // Flip the vector, and blend the results which should now be in-lane.
12610 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
12611 SDValue Flipped = DAG.getBitcast(PVT, V1);
12612 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
12614 Flipped = DAG.getBitcast(VT, Flipped);
12615 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12618 /// \brief Handle lowering 2-lane 128-bit shuffles.
12619 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12620 SDValue V2, ArrayRef<int> Mask,
12621 const APInt &Zeroable,
12622 const X86Subtarget &Subtarget,
12623 SelectionDAG &DAG) {
12624 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
12625 if (Subtarget.hasAVX2() && V2.isUndef())
12628 SmallVector<int, 4> WidenedMask;
12629 if (!canWidenShuffleElements(Mask, WidenedMask))
12632 // TODO: If minimizing size and one of the inputs is a zero vector and the
12633 // zero vector has only one use, we could use a VPERM2X128 to save the
12634 // instruction bytes needed to explicitly generate the zero vector.
12636 // Blends are faster and handle all the non-lane-crossing cases.
12637 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12638 Zeroable, Subtarget, DAG))
12641 bool IsLowZero = (Zeroable & 0x3) == 0x3;
12642 bool IsHighZero = (Zeroable & 0xc) == 0xc;
12644 // If either input operand is a zero vector, use VPERM2X128 because its mask
12645 // allows us to replace the zero input with an implicit zero.
12646 if (!IsLowZero && !IsHighZero) {
12647 // Check for patterns which can be matched with a single insert of a 128-bit
12649 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12650 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12652 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
12653 // this will likely become vinsertf128 which can't fold a 256-bit memop.
12654 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
12655 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12656 VT.getVectorNumElements() / 2);
12657 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12658 DAG.getIntPtrConstant(0, DL));
12659 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12660 OnlyUsesV1 ? V1 : V2,
12661 DAG.getIntPtrConstant(0, DL));
12662 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12666 // Try to use SHUF128 if possible.
12667 if (Subtarget.hasVLX()) {
12668 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
12669 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
12670 ((WidenedMask[1] % 2) << 1);
12671 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
12672 DAG.getConstant(PermMask, DL, MVT::i8));
12677 // Otherwise form a 128-bit permutation. After accounting for undefs,
12678 // convert the 64-bit shuffle mask selection values into 128-bit
12679 // selection bits by dividing the indexes by 2 and shifting into positions
12680 // defined by a vperm2*128 instruction's immediate control byte.
12682 // The immediate permute control byte looks like this:
12683 // [1:0] - select 128 bits from sources for low half of destination
12685 // [3] - zero low half of destination
12686 // [5:4] - select 128 bits from sources for high half of destination
12688 // [7] - zero high half of destination
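// For example, WidenedMask <1, 2> selects the high half of V1 for the low half
// of the result and the low half of V2 for the high half, giving PermMask =
// 0x21. A zeroable half sets the corresponding zero bit (0x08 or 0x80) instead
// so the hardware writes zeros for that half.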
12690 assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
12692 unsigned PermMask = 0;
12693 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
12694 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
12696 // Check the immediate mask and replace unused sources with undef.
12697 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
12698 V1 = DAG.getUNDEF(VT);
12699 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
12700 V2 = DAG.getUNDEF(VT);
12702 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12703 DAG.getConstant(PermMask, DL, MVT::i8));
12706 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12707 /// shuffling each lane.
12709 /// This will only succeed when the result of fixing the 128-bit lanes results
12710 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12711 /// each 128-bit lane. This handles many cases where we can quickly blend away
12712 /// the lane crosses early and then use simpler shuffles within each lane.
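/// For example, the two-input v8f32 mask <2,3,0,1,14,15,12,13> first gets a
/// lane-fixing shuffle that places V1's low lane and V2's high lane, followed
/// by the repeating in-lane shuffle <2,3,0,1> applied to each 128-bit lane.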
12714 /// FIXME: It might be worthwhile at some point to support this without
12715 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12716 /// in x86 only floating point has interesting non-repeating shuffles, and even
12717 /// those are still *marginally* more expensive.
12718 static SDValue lowerVectorShuffleByMerging128BitLanes(
12719 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12720 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12721 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12723 int Size = Mask.size();
12724 int LaneSize = 128 / VT.getScalarSizeInBits();
12725 int NumLanes = Size / LaneSize;
12726 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12728 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12729 // check whether the in-128-bit lane shuffles share a repeating pattern.
12730 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12731 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12732 for (int i = 0; i < Size; ++i) {
12736 int j = i / LaneSize;
12738 if (Lanes[j] < 0) {
12739 // First entry we've seen for this lane.
12740 Lanes[j] = Mask[i] / LaneSize;
12741 } else if (Lanes[j] != Mask[i] / LaneSize) {
12742 // This doesn't match the lane selected previously!
12746 // Check that within each lane we have a consistent shuffle mask.
12747 int k = i % LaneSize;
12748 if (InLaneMask[k] < 0) {
12749 InLaneMask[k] = Mask[i] % LaneSize;
12750 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12751 // This doesn't fit a repeating in-lane mask.
12756 // First shuffle the lanes into place.
12757 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12758 VT.getSizeInBits() / 64);
12759 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12760 for (int i = 0; i < NumLanes; ++i)
12761 if (Lanes[i] >= 0) {
12762 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12763 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12766 V1 = DAG.getBitcast(LaneVT, V1);
12767 V2 = DAG.getBitcast(LaneVT, V2);
12768 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12770 // Cast it back to the type we actually want.
12771 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12773 // Now do a simple shuffle that isn't lane crossing.
12774 SmallVector<int, 8> NewMask((unsigned)Size, -1);
12775 for (int i = 0; i < Size; ++i)
12777 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12778 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12779 "Must not introduce lane crosses at this point!");
12781 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12784 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
12785 /// This allows for fast cases such as subvector extraction/insertion
12786 /// or shuffling smaller vector types which can lower more efficiently.
12787 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12788 SDValue V1, SDValue V2,
12789 ArrayRef<int> Mask,
12790 const X86Subtarget &Subtarget,
12791 SelectionDAG &DAG) {
12792 assert((VT.is256BitVector() || VT.is512BitVector()) &&
12793 "Expected 256-bit or 512-bit vector");
12795 unsigned NumElts = VT.getVectorNumElements();
12796 unsigned HalfNumElts = NumElts / 2;
12797 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12799 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12800 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12801 if (!UndefLower && !UndefUpper)
12804 // Upper half is undef and lower half is whole upper subvector.
12805 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12807 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12808 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12809 DAG.getIntPtrConstant(HalfNumElts, DL));
12810 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12811 DAG.getIntPtrConstant(0, DL));
12814 // Lower half is undef and upper half is whole lower subvector.
12815 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12817 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12818 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12819 DAG.getIntPtrConstant(0, DL));
12820 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12821 DAG.getIntPtrConstant(HalfNumElts, DL));
12824 // If the shuffle only uses two of the four halves of the input operands,
12825 // then extract them and perform the 'half' shuffle at half width.
12826 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12827 int HalfIdx1 = -1, HalfIdx2 = -1;
12828 SmallVector<int, 8> HalfMask(HalfNumElts);
12829 unsigned Offset = UndefLower ? HalfNumElts : 0;
12830 for (unsigned i = 0; i != HalfNumElts; ++i) {
12831 int M = Mask[i + Offset];
12837 // Determine which of the 4 half vectors this element is from.
12838 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12839 int HalfIdx = M / HalfNumElts;
12841 // Determine the element index into its half vector source.
12842 int HalfElt = M % HalfNumElts;
12844 // We can shuffle with up to 2 half vectors, set the new 'half'
12845 // shuffle mask accordingly.
12846 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12847 HalfMask[i] = HalfElt;
12848 HalfIdx1 = HalfIdx;
12851 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12852 HalfMask[i] = HalfElt + HalfNumElts;
12853 HalfIdx2 = HalfIdx;
12857 // Too many half vectors referenced.
12860 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12862 // Only shuffle the halves of the inputs when useful.
12863 int NumLowerHalves =
12864 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12865 int NumUpperHalves =
12866 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12868 // uuuuXXXX - don't extract uppers just to insert again.
12869 if (UndefLower && NumUpperHalves != 0)
12872 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12873 if (UndefUpper && NumUpperHalves == 2)
12876 // AVX2 - XXXXuuuu - always extract lowers.
12877 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12878 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12879 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12881 // AVX2 supports variable 32-bit element cross-lane shuffles.
12882 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12883 // XXXXuuuu - don't extract lowers and uppers.
12884 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12889 // AVX512 - XXXXuuuu - always extract lowers.
12890 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
12893 auto GetHalfVector = [&](int HalfIdx) {
12895 return DAG.getUNDEF(HalfVT);
12896 SDValue V = (HalfIdx < 2 ? V1 : V2);
12897 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12898 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12899 DAG.getIntPtrConstant(HalfIdx, DL));
12902 SDValue Half1 = GetHalfVector(HalfIdx1);
12903 SDValue Half2 = GetHalfVector(HalfIdx2);
12904 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12905 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12906 DAG.getIntPtrConstant(Offset, DL));
12909 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
12912 /// This returns true if the elements from a particular input are already in the
12913 /// slot required by the given mask and require no permutation.
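/// For example, in the two-input v4f64 mask <0, 5, 2, 7> both inputs are in
/// place: elements 0 and 2 of V1 already occupy slots 0 and 2, and elements 1
/// and 3 of V2 (mask values 5 and 7) already occupy slots 1 and 3.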
12914 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12915 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12916 int Size = Mask.size();
12917 for (int i = 0; i < Size; ++i)
12918 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12924 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12925 /// every lane can be represented as the same repeating mask - allowing us to
12926 /// shuffle the sources with the repeating shuffle and then permute the result
12927 /// to the destination lanes.
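/// For example, the single-input v8f32 mask <5,4,7,6,1,0,3,2> becomes the
/// repeating in-lane shuffle <1,0,3,2> applied to both 128-bit lanes followed
/// by a permute that swaps the two lanes of the result.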
12928 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12929 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12930 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12931 int NumElts = VT.getVectorNumElements();
12932 int NumLanes = VT.getSizeInBits() / 128;
12933 int NumLaneElts = NumElts / NumLanes;
12935 // On AVX2 we may be able to just shuffle the lowest elements and then
12936 // broadcast the result.
12937 if (Subtarget.hasAVX2()) {
12938 for (unsigned BroadcastSize : {16, 32, 64}) {
12939 if (BroadcastSize <= VT.getScalarSizeInBits())
12941 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12943 // Attempt to match a repeating pattern every NumBroadcastElts,
12944 // accounting for UNDEFs, but only referencing the lowest 128-bit
12945 // lane of the inputs.
12946 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12947 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12948 for (int j = 0; j != NumBroadcastElts; ++j) {
12949 int M = Mask[i + j];
12952 int &R = RepeatMask[j];
12953 if (0 != ((M % NumElts) / NumLaneElts))
12955 if (0 <= R && R != M)
12962 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12963 if (!FindRepeatingBroadcastMask(RepeatMask))
12966 // Shuffle the (lowest) repeated elements in place for broadcast.
12967 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12969 // Shuffle the actual broadcast.
12970 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12971 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12972 for (int j = 0; j != NumBroadcastElts; ++j)
12973 BroadcastMask[i + j] = j;
12974 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12979 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12980 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12983 // Bail if we already have a repeated lane shuffle mask.
12984 SmallVector<int, 8> RepeatedShuffleMask;
12985 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12988 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12989 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12990 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12991 int NumSubLanes = NumLanes * SubLaneScale;
12992 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12994 // Check that all the sources are coming from the same lane and see if we can
12995 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12996 // determine the source sub-lane for each destination sub-lane.
12997 int TopSrcSubLane = -1;
12998 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12999 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
13000 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
13001 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
13003 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
13004 // Extract the sub-lane mask, check that it all comes from the same lane
13005 // and normalize the mask entries to come from the first lane.
13007 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
13008 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13009 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
13012 int Lane = (M % NumElts) / NumLaneElts;
13013 if ((0 <= SrcLane) && (SrcLane != Lane))
13016 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
13017 SubLaneMask[Elt] = LocalM;
13020 // Whole sub-lane is UNDEF.
13024 // Attempt to match against the candidate repeated sub-lane masks.
13025 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
13026 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
13027 for (int i = 0; i != NumSubLaneElts; ++i) {
13028 if (M1[i] < 0 || M2[i] < 0)
13030 if (M1[i] != M2[i])
13036 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
13037 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
13040 // Merge the sub-lane mask into the matching repeated sub-lane mask.
13041 for (int i = 0; i != NumSubLaneElts; ++i) {
13042 int M = SubLaneMask[i];
13045 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
13046 "Unexpected mask element");
13047 RepeatedSubLaneMask[i] = M;
13050 // Track the topmost source sub-lane - by setting the remaining to UNDEF
13051 // we can greatly simplify shuffle matching.
13052 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
13053 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
13054 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
13058 // Bail if we failed to find a matching repeated sub-lane mask.
13059 if (Dst2SrcSubLanes[DstSubLane] < 0)
13062 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
13063 "Unexpected source lane");
13065 // Create a repeating shuffle mask for the entire vector.
13066 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
13067 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
13068 int Lane = SubLane / SubLaneScale;
13069 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
13070 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13071 int M = RepeatedSubLaneMask[Elt];
13074 int Idx = (SubLane * NumSubLaneElts) + Elt;
13075 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
13078 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
13080 // Shuffle each source sub-lane to its destination.
13081 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
13082 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
13083 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
13084 if (SrcSubLane < 0)
13086 for (int j = 0; j != NumSubLaneElts; ++j)
13087 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
13090 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
13094 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
13095 unsigned &ShuffleImm,
13096 ArrayRef<int> Mask) {
13097 int NumElts = VT.getVectorNumElements();
13098 assert(VT.getScalarSizeInBits() == 64 &&
13099 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
13100 "Unexpected data type for VSHUFPD");
13102 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
13103 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
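// For example, the v4f64 mask <1, 5, 2, 7> fits this pattern and produces
// ShuffleImm = 0b1011, since bit i simply holds (Mask[i] % 2).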
13105 bool ShufpdMask = true;
13106 bool CommutableMask = true;
13107 for (int i = 0; i < NumElts; ++i) {
13108 if (Mask[i] == SM_SentinelUndef)
13112 int Val = (i & 6) + NumElts * (i & 1);
13113 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
13114 if (Mask[i] < Val || Mask[i] > Val + 1)
13115 ShufpdMask = false;
13116 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
13117 CommutableMask = false;
13118 ShuffleImm |= (Mask[i] % 2) << i;
13123 if (CommutableMask) {
13131 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
13132 ArrayRef<int> Mask, SDValue V1,
13133 SDValue V2, SelectionDAG &DAG) {
13134 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
13135 "Unexpected data type for VSHUFPD");
13137 unsigned Immediate = 0;
13138 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
13141 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13142 DAG.getConstant(Immediate, DL, MVT::i8));
13145 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
13147 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13148 /// isn't available.
13149 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13150 const APInt &Zeroable,
13151 SDValue V1, SDValue V2,
13152 const X86Subtarget &Subtarget,
13153 SelectionDAG &DAG) {
13154 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13155 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13156 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13158 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13159 Zeroable, Subtarget, DAG))
13162 if (V2.isUndef()) {
13163 // Check for being able to broadcast a single element.
13164 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13165 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13168 // Use low duplicate instructions for masks that match their pattern.
13169 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13170 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13172 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13173 // Non-half-crossing single input shuffles can be lowered with an
13174 // interleaved permutation.
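// For example, the mask <0, 0, 3, 3> yields VPERMILPMask = 0b1100: bit i is
// set when element i takes the upper double of its 128-bit lane.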
13175 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13176 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13177 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13178 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13181 // With AVX2 we have direct support for this permutation.
13182 if (Subtarget.hasAVX2())
13183 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13184 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13186 // Try to create an in-lane repeating shuffle mask and then shuffle the
13187 // results into the target lanes.
13188 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13189 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13192 // Otherwise, fall back.
13193 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
13197 // Use dedicated unpack instructions for masks that match their pattern.
13199 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13202 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13203 Zeroable, Subtarget, DAG))
13206 // Check if the blend happens to exactly fit that of SHUFPD.
13208 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
13211 // Try to create an in-lane repeating shuffle mask and then shuffle the
13212 // results into the target lanes.
13213 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13214 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13217 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13218 // shuffle. However, if we have AVX2 and either input is already in place,
13219 // we will be able to shuffle the other input even across lanes in a single
13220 // instruction, so skip this pattern.
13221 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13222 isShuffleMaskInputInPlace(1, Mask))))
13223 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13224 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13226 // If we have VLX support, we can use VEXPAND.
13227 if (Subtarget.hasVLX())
13228 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13229 V1, V2, DAG, Subtarget))
13232 // If we have AVX2 then we always want to lower with a blend because at v4 we
13233 // can fully permute the elements.
13234 if (Subtarget.hasAVX2())
13235 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13238 // Otherwise fall back on generic lowering.
13239 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13242 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
13244 /// This routine is only called when we have AVX2 and thus a reasonable
13245 /// instruction set for v4i64 shuffling.
13246 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13247 const APInt &Zeroable,
13248 SDValue V1, SDValue V2,
13249 const X86Subtarget &Subtarget,
13250 SelectionDAG &DAG) {
13251 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13252 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13253 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13254 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13256 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13257 Zeroable, Subtarget, DAG))
13260 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13261 Zeroable, Subtarget, DAG))
13264 // Check for being able to broadcast a single element.
13265 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13266 Mask, Subtarget, DAG))
13269 if (V2.isUndef()) {
13270 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
13271 // can use lower latency instructions that will operate on both lanes.
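// For example, the v4i64 mask <1, 0, 3, 2> repeats as <1, 0> in each lane and
// becomes a single PSHUFD on v8i32 with the scaled mask <2, 3, 0, 1>.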
13272 SmallVector<int, 2> RepeatedMask;
13273 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
13274 SmallVector<int, 4> PSHUFDMask;
13275 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13276 return DAG.getBitcast(
13278 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13279 DAG.getBitcast(MVT::v8i32, V1),
13280 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13283 // AVX2 provides a direct instruction for permuting a single input across
13285 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13286 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13289 // Try to use shift instructions.
13290 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13291 Zeroable, Subtarget, DAG))
13294 // If we have VLX support, we can use VALIGN or VEXPAND.
13295 if (Subtarget.hasVLX()) {
13296 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13297 Mask, Subtarget, DAG))
13300 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13301 V1, V2, DAG, Subtarget))
13305 // Try to use PALIGNR.
13306 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13307 Mask, Subtarget, DAG))
13310 // Use dedicated unpack instructions for masks that match their pattern.
13312 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
13315 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13316 // shuffle. However, if we have AVX2 and either input is already in place,
13317 // we will be able to shuffle the other input even across lanes in a single
13318 // instruction, so skip this pattern.
13319 if (!isShuffleMaskInputInPlace(0, Mask) &&
13320 !isShuffleMaskInputInPlace(1, Mask))
13321 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13322 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13325 // Otherwise fall back on generic blend lowering.
13326 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13330 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
13332 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
13333 /// isn't available.
13334 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13335 const APInt &Zeroable,
13336 SDValue V1, SDValue V2,
13337 const X86Subtarget &Subtarget,
13338 SelectionDAG &DAG) {
13339 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13340 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13341 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13343 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
13344 Zeroable, Subtarget, DAG))
13347 // Check for being able to broadcast a single element.
13348 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
13349 Mask, Subtarget, DAG))
13352 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13353 // options to efficiently lower the shuffle.
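// For example, the repeated mask <1, 1, 3, 3> is exactly MOVSHDUP and
// <0, 0, 2, 2> is exactly MOVSLDUP, both of which are matched below.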
13354 SmallVector<int, 4> RepeatedMask;
13355 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
13356 assert(RepeatedMask.size() == 4 &&
13357 "Repeated masks must be half the mask width!");
13359 // Use even/odd duplicate instructions for masks that match their pattern.
13360 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13361 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
13362 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13363 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
13366 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
13367 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13369 // Use dedicated unpack instructions for masks that match their pattern.
13371 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
13374 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
13375 // have already handled any direct blends.
13376 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
13379 // Try to create an in-lane repeating shuffle mask and then shuffle the
13380 // results into the target lanes.
13381 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13382 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13385 // If we have a single input shuffle with different shuffle patterns in the
13386 // two 128-bit lanes, use the variable mask form of VPERMILPS.
13387 if (V2.isUndef()) {
13388 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13389 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
13390 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
13392 if (Subtarget.hasAVX2())
13393 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
13395 // Otherwise, fall back.
13396 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
13400 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13402 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13403 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13405 // If we have VLX support, we can use VEXPAND.
13406 if (Subtarget.hasVLX())
13407 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
13408 V1, V2, DAG, Subtarget))
13411 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern, try
13412 // to split, since after the split we get more efficient code using
13413 // vpunpcklwd and vpunpckhwd than with vblend.
13414 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
13415 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
13419 // If we have AVX2 then we always want to lower with a blend because at v8 we
13420 // can fully permute the elements.
13421 if (Subtarget.hasAVX2())
13422 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
13425 // Otherwise fall back on generic lowering.
13426 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
13429 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
13431 /// This routine is only called when we have AVX2 and thus a reasonable
13432 /// instruction set for v8i32 shuffling.
13433 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13434 const APInt &Zeroable,
13435 SDValue V1, SDValue V2,
13436 const X86Subtarget &Subtarget,
13437 SelectionDAG &DAG) {
13438 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13439 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13440 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13441 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
13443 // Whenever we can lower this as a zext, that instruction is strictly faster
13444 // than any alternative. It also allows us to fold memory operands into the
13445 // shuffle in many cases.
13446 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13447 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13450 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern, try
13451 // to split, since after the split we get more efficient code than vblend by
13452 // using vpunpcklwd and vpunpckhwd.
13453 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
13454 !Subtarget.hasAVX512())
13456 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
13459 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
13460 Zeroable, Subtarget, DAG))
13463 // Check for being able to broadcast a single element.
13464 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13465 Mask, Subtarget, DAG))
13468 // If the shuffle mask is repeated in each 128-bit lane we can use more
13469 // efficient instructions that mirror the shuffles across the two 128-bit
13471 SmallVector<int, 4> RepeatedMask;
13472 bool Is128BitLaneRepeatedShuffle =
13473 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13474 if (Is128BitLaneRepeatedShuffle) {
13475 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13477 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13478 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13480 // Use dedicated unpack instructions for masks that match their pattern.
13482 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13486 // Try to use shift instructions.
13487 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13488 Zeroable, Subtarget, DAG))
13491 // If we have VLX support, we can use VALIGN or EXPAND.
13492 if (Subtarget.hasVLX()) {
13493 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13494 Mask, Subtarget, DAG))
13497 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13498 V1, V2, DAG, Subtarget))
13502 // Try to use byte rotation instructions.
13503 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13504 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13507 // Try to create an in-lane repeating shuffle mask and then shuffle the
13508 // results into the target lanes.
13509 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13510 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13513 // If the shuffle patterns aren't repeated but it is a single input, directly
13514 // generate a cross-lane VPERMD instruction.
13515 if (V2.isUndef()) {
13516 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13517 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13520 // Assume that a single SHUFPS is faster than an alternative sequence of
13521 // multiple instructions (even if the CPU has a domain penalty).
13522 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13523 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13524 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13525 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13526 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13527 CastV1, CastV2, DAG);
13528 return DAG.getBitcast(MVT::v8i32, ShufPS);
13531 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13533 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13534 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13537 // Otherwise fall back on generic blend lowering.
13538 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13542 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13544 /// This routine is only called when we have AVX2 and thus a reasonable
13545 /// instruction set for v16i16 shuffling.
13546 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13547 const APInt &Zeroable,
13548 SDValue V1, SDValue V2,
13549 const X86Subtarget &Subtarget,
13550 SelectionDAG &DAG) {
13551 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13552 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13553 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13554 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13556 // Whenever we can lower this as a zext, that instruction is strictly faster
13557 // than any alternative. It also allows us to fold memory operands into the
13558 // shuffle in many cases.
13559 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13560 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13563 // Check for being able to broadcast a single element.
13564 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13565 Mask, Subtarget, DAG))
13568 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13569 Zeroable, Subtarget, DAG))
13572 // Use dedicated unpack instructions for masks that match their pattern.
13574 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13577 // Use dedicated pack instructions for masks that match their pattern.
13578 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
13582 // Try to use shift instructions.
13583 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13584 Zeroable, Subtarget, DAG))
13587 // Try to use byte rotation instructions.
13588 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13589 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13592 // Try to create an in-lane repeating shuffle mask and then shuffle the
13593 // results into the target lanes.
13594 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13595 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13598 if (V2.isUndef()) {
13599 // There are no generalized cross-lane shuffle operations available on i16
13601 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13602 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13603 Mask, DAG, Subtarget);
13605 SmallVector<int, 8> RepeatedMask;
13606 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13607 // As this is a single-input shuffle, the repeated mask should be
13608 // a strictly valid v8i16 mask that we can pass through to the v8i16
13609 // lowering to handle even the v16 case.
13610 return lowerV8I16GeneralSingleInputVectorShuffle(
13611 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13615 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13616 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13619 // AVX512BWVL can lower to VPERMW.
13620 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13621 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13623 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13625 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13626 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13629 // Otherwise fall back on generic lowering.
13630 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13633 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13635 /// This routine is only called when we have AVX2 and thus a reasonable
13636 /// instruction set for v32i8 shuffling.
13637 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13638 const APInt &Zeroable,
13639 SDValue V1, SDValue V2,
13640 const X86Subtarget &Subtarget,
13641 SelectionDAG &DAG) {
13642 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13643 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13644 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13645 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13647 // Whenever we can lower this as a zext, that instruction is strictly faster
13648 // than any alternative. It also allows us to fold memory operands into the
13649 // shuffle in many cases.
13650 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13651 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13654 // Check for being able to broadcast a single element.
13655 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13656 Mask, Subtarget, DAG))
13659 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13660 Zeroable, Subtarget, DAG))
13663 // Use dedicated unpack instructions for masks that match their pattern.
13665 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13668 // Use dedicated pack instructions for masks that match their pattern.
13669 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
13673 // Try to use shift instructions.
13674 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13675 Zeroable, Subtarget, DAG))
13678 // Try to use byte rotation instructions.
13679 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13680 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13683 // Try to create an in-lane repeating shuffle mask and then shuffle the
13684 // results into the target lanes.
13685 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13686 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13689 // There are no generalized cross-lane shuffle operations available on i8
13691 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13692 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13695 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13696 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13699 // AVX512VBMIVL can lower to VPERMB.
13700 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
13701 return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
13703 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13705 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13706 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13709 // Otherwise fall back on generic lowering.
13710 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13713 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13715 /// This routine either breaks down the specific type of a 256-bit x86 vector
13716 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13717 /// together based on the available instructions.
13718 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13719 MVT VT, SDValue V1, SDValue V2,
13720 const APInt &Zeroable,
13721 const X86Subtarget &Subtarget,
13722 SelectionDAG &DAG) {
13723 // If we have a single input to the zero element, insert that into V1 if we
13724 // can do so cheaply.
13725 int NumElts = VT.getVectorNumElements();
13726 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13728 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13729 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13730 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13733 // Handle special cases where the lower or upper half is UNDEF.
13735 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13738 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13739 // can check for those subtargets here and avoid much of the subtarget
13740 // querying in the per-vector-type lowering routines. With AVX1 we have
13741 // essentially *zero* ability to manipulate a 256-bit vector with integer
13742 // types. Since we'll use floating point types there eventually, just
13743 // immediately cast everything to a float and operate entirely in that domain.
13744 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13745 int ElementBits = VT.getScalarSizeInBits();
13746 if (ElementBits < 32) {
13747 // No floating point type available; if we can't use the bit operations
13748 // for masking/blending then decompose into 128-bit vectors.
13750 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13752 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13754 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13757 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13758 VT.getVectorNumElements());
13759 V1 = DAG.getBitcast(FpVT, V1);
13760 V2 = DAG.getBitcast(FpVT, V2);
13761 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13764 switch (VT.SimpleTy) {
13766 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13768 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13770 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13772 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13774 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13776 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13779 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13783 /// \brief Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
13784 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13785 ArrayRef<int> Mask, SDValue V1,
13786 SDValue V2, SelectionDAG &DAG) {
13787 assert(VT.getScalarSizeInBits() == 64 &&
13788 "Unexpected element type size for 128bit shuffle.");
13790 // Handling a 256-bit vector here would require VLX, and most probably
13791 // lowerV2X128VectorShuffle() is the better solution for that case.
13792 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13794 SmallVector<int, 4> WidenedMask;
13795 if (!canWidenShuffleElements(Mask, WidenedMask))
13798 // Check for patterns which can be matched with a single insert of a 256-bit
13800 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13801 {0, 1, 2, 3, 0, 1, 2, 3});
13802 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13803 {0, 1, 2, 3, 8, 9, 10, 11})) {
13804 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13805 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13806 DAG.getIntPtrConstant(0, DL));
13807 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13808 OnlyUsesV1 ? V1 : V2,
13809 DAG.getIntPtrConstant(0, DL));
13810 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13813 assert(WidenedMask.size() == 4);
13815 // See if this is an insertion of the lower 128-bits of V2 into V1.
13816 bool IsInsert = true;
13818 for (int i = 0; i < 4; ++i) {
13819 assert(WidenedMask[i] >= -1);
13820 if (WidenedMask[i] < 0)
13823 // Make sure all V1 subvectors are in place.
13824 if (WidenedMask[i] < 4) {
13825 if (WidenedMask[i] != i) {
13831 // Make sure we only have a single V2 index and it's the lowest 128 bits.
13831 if (V2Index >= 0 || WidenedMask[i] != 4) {
13838 if (IsInsert && V2Index >= 0) {
13839 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13840 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13841 DAG.getIntPtrConstant(0, DL));
13842 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13845 // Try to lower to vshuf64x2/vshuf32x4.
13846 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13847 unsigned PermMask = 0;
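// Illustrative example of the encoding built below: for the widened 128-bit
// chunk mask <0, 2, 5, 7>, Ops[0] becomes V1 and Ops[1] becomes V2, and the
// immediate is 0b11011000 (0xD8) -- two bits per destination chunk selecting
// which 128-bit chunk of the corresponding source operand is used.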
13848 // Ensure elements came from the same Op.
13849 for (int i = 0; i < 4; ++i) {
13850 assert(WidenedMask[i] >= -1);
13851 if (WidenedMask[i] < 0)
13854 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13855 unsigned OpIndex = i / 2;
13856 if (Ops[OpIndex].isUndef())
13858 else if (Ops[OpIndex] != Op)
13861 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13862 // bits defined by a vshuf64x2 instruction's immediate control byte.
13863 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13866 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13867 DAG.getConstant(PermMask, DL, MVT::i8));
13870 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13871 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13872 const APInt &Zeroable,
13873 SDValue V1, SDValue V2,
13874 const X86Subtarget &Subtarget,
13875 SelectionDAG &DAG) {
13876 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13877 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13878 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13880 if (V2.isUndef()) {
13881 // Use low duplicate instructions for masks that match their pattern.
13882 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13883 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13885 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13886 // Non-half-crossing single input shuffles can be lowered with an
13887 // interleaved permutation.
13888 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13889 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13890 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13891 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
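// For example, the mask <1, 0, 2, 3, 5, 4, 6, 7> swaps the elements within
// the first and third 128-bit lanes only, giving the immediate 0x99.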
13892 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13893 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13896 SmallVector<int, 4> RepeatedMask;
13897 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13898 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13899 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13902 if (SDValue Shuf128 =
13903 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13906 if (SDValue Unpck =
13907 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13910 // Check if the blend happens to exactly fit that of SHUFPD.
13912 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13915 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13916 V2, DAG, Subtarget))
13919 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13920 Zeroable, Subtarget, DAG))
13923 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13926 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13927 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13928 const APInt &Zeroable,
13929 SDValue V1, SDValue V2,
13930 const X86Subtarget &Subtarget,
13931 SelectionDAG &DAG) {
13932 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13933 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13934 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13936 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13937 // options to efficiently lower the shuffle.
13938 SmallVector<int, 4> RepeatedMask;
13939 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13940 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13942 // Use even/odd duplicate instructions for masks that match their pattern.
13943 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13944 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13945 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13946 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13949 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13950 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13952 // Use dedicated unpack instructions for masks that match their pattern.
13953 if (SDValue Unpck =
13954 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13957 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13958 Zeroable, Subtarget, DAG))
13961 // Otherwise, fall back to a SHUFPS sequence.
13962 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13965 // If we have a single input shuffle with different shuffle patterns in the
13966 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
13967 if (V2.isUndef() &&
13968 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
13969 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
13970 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
13973 // If we have AVX512F support, we can use VEXPAND.
13974 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13975 V1, V2, DAG, Subtarget))
13978 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13981 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13982 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13983 const APInt &Zeroable,
13984 SDValue V1, SDValue V2,
13985 const X86Subtarget &Subtarget,
13986 SelectionDAG &DAG) {
13987 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13988 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13989 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13991 if (V2.isUndef()) {
13992 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13993 // can use lower latency instructions that will operate on all four 128-bit lanes.
13995 SmallVector<int, 2> Repeated128Mask;
13996 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13997 SmallVector<int, 4> PSHUFDMask;
13998 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
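// For example, a repeated 128-bit lane mask of <1, 0> scales to the per-lane
// v16i32 PSHUFD mask <2, 3, 0, 1>, swapping the two i64 halves in every lane.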
13999 return DAG.getBitcast(
14001 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
14002 DAG.getBitcast(MVT::v16i32, V1),
14003 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14006 SmallVector<int, 4> Repeated256Mask;
14007 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
14008 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
14009 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
14012 if (SDValue Shuf128 =
14013 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
14016 // Try to use shift instructions.
14017 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
14018 Zeroable, Subtarget, DAG))
14021 // Try to use VALIGN.
14022 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
14023 Mask, Subtarget, DAG))
14026 // Try to use PALIGNR.
14027 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
14028 Mask, Subtarget, DAG))
14031 if (SDValue Unpck =
14032 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
14034 // If we have AVX512F support, we can use VEXPAND.
14035 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
14036 V2, DAG, Subtarget))
14039 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
14040 Zeroable, Subtarget, DAG))
14043 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
14046 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
14047 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14048 const APInt &Zeroable,
14049 SDValue V1, SDValue V2,
14050 const X86Subtarget &Subtarget,
14051 SelectionDAG &DAG) {
14052 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14053 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14054 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14056 // Whenever we can lower this as a zext, that instruction is strictly faster
14057 // than any alternative. It also allows us to fold memory operands into the
14058 // shuffle in many cases.
14059 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14060 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14063 // If the shuffle mask is repeated in each 128-bit lane we can use more
14064 // efficient instructions that mirror the shuffles across the four 128-bit lanes.
14066 SmallVector<int, 4> RepeatedMask;
14067 bool Is128BitLaneRepeatedShuffle =
14068 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
14069 if (Is128BitLaneRepeatedShuffle) {
14070 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14072 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
14073 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14075 // Use dedicated unpack instructions for masks that match their pattern.
14077 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
14081 // Try to use shift instructions.
14082 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
14083 Zeroable, Subtarget, DAG))
14086 // Try to use VALIGN.
14087 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
14088 Mask, Subtarget, DAG))
14091 // Try to use byte rotation instructions.
14092 if (Subtarget.hasBWI())
14093 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14094 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
14097 // Assume that a single SHUFPS is faster than using a permv shuffle.
14098 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14099 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
14100 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
14101 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
14102 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
14103 CastV1, CastV2, DAG);
14104 return DAG.getBitcast(MVT::v16i32, ShufPS);
14106 // If we have AVX512F support, we can use VEXPAND.
14107 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
14108 V1, V2, DAG, Subtarget))
14111 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
14112 Zeroable, Subtarget, DAG))
14114 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
14117 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
14118 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14119 const APInt &Zeroable,
14120 SDValue V1, SDValue V2,
14121 const X86Subtarget &Subtarget,
14122 SelectionDAG &DAG) {
14123 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14124 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14125 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14126 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
14128 // Whenever we can lower this as a zext, that instruction is strictly faster
14129 // than any alternative. It also allows us to fold memory operands into the
14130 // shuffle in many cases.
14131 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14132 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14135 // Use dedicated unpack instructions for masks that match their pattern.
14137 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
14140 // Try to use shift instructions.
14141 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
14142 Zeroable, Subtarget, DAG))
14145 // Try to use byte rotation instructions.
14146 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14147 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
14150 if (V2.isUndef()) {
14151 SmallVector<int, 8> RepeatedMask;
14152 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
14153 // As this is a single-input shuffle, the repeated mask should be
14154 // a strictly valid v8i16 mask that we can pass through to the v8i16
14155 // lowering to handle even the v32 case.
14156 return lowerV8I16GeneralSingleInputVectorShuffle(
14157 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
14161 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
14162 Zeroable, Subtarget, DAG))
14165 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14166 DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14169 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
14172 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
14173 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14174 const APInt &Zeroable,
14175 SDValue V1, SDValue V2,
14176 const X86Subtarget &Subtarget,
14177 SelectionDAG &DAG) {
14178 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14179 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14180 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14181 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14183 // Whenever we can lower this as a zext, that instruction is strictly faster
14184 // than any alternative. It also allows us to fold memory operands into the
14185 // shuffle in many cases.
14186 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14187 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14190 // Use dedicated unpack instructions for masks that match their pattern.
14192 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
14195 // Try to use shift instructions.
14196 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
14197 Zeroable, Subtarget, DAG))
14200 // Try to use byte rotation instructions.
14201 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14202 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14205 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14206 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14209 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14210 if (Subtarget.hasVBMI())
14211 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
14213 // Try to create an in-lane repeating shuffle mask and then shuffle the
14214 // results into the target lanes.
14215 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14216 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14219 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
14220 Zeroable, Subtarget, DAG))
14223 // FIXME: Implement direct support for this type!
14224 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
14227 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
14229 /// This routine either breaks down the specific type of a 512-bit x86 vector
14230 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
14231 /// together based on the available instructions.
14232 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14233 MVT VT, SDValue V1, SDValue V2,
14234 const APInt &Zeroable,
14235 const X86Subtarget &Subtarget,
14236 SelectionDAG &DAG) {
14237 assert(Subtarget.hasAVX512() &&
14238 "Cannot lower 512-bit vectors w/ basic ISA!");
14240 // If we have a single input to the zero element, insert that into V1 if we
14241 // can do so cheaply.
14242 int NumElts = Mask.size();
14243 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14245 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14246 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14247 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14250 // Handle special cases where the lower or upper half is UNDEF.
14252 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14255 // Check for being able to broadcast a single element.
14256 if (SDValue Broadcast =
14257 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
14260 // Dispatch to each element type for lowering. If we don't have support for
14261 // specific element type shuffles at 512 bits, immediately split them and
14262 // lower them. Each lowering routine of a given type is allowed to assume that
14263 // the requisite ISA extensions for that element type are available.
14264 switch (VT.SimpleTy) {
14266 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14268 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14270 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14272 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14274 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14276 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14279 llvm_unreachable("Not a valid 512-bit x86 vector type!");
14283 // Lower vXi1 vector shuffles.
14284 // There is no dedicated instruction on AVX-512 that shuffles the masks.
14285 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
14286 // vector, shuffle it, and then truncate it back.
14287 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14288 MVT VT, SDValue V1, SDValue V2,
14289 const X86Subtarget &Subtarget,
14290 SelectionDAG &DAG) {
14291 assert(Subtarget.hasAVX512() &&
14292 "Cannot lower 512-bit vectors w/o basic ISA!");
14294 switch (VT.SimpleTy) {
14296 llvm_unreachable("Expected a vector of i1 elements");
14298 ExtVT = MVT::v2i64;
14301 ExtVT = MVT::v4i32;
14304 // Take a 512-bit type, which gives more shuffle options on KNL. If we have VLX use a 256-bit type.
14306 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
14309 ExtVT = MVT::v16i32;
14312 ExtVT = MVT::v32i16;
14315 ExtVT = MVT::v64i8;
14319 if (ISD::isBuildVectorAllZeros(V1.getNode()))
14320 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14321 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
14322 V1 = getOnesVector(ExtVT, DAG, DL);
14324 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
14327 V2 = DAG.getUNDEF(ExtVT);
14328 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
14329 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14330 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
14331 V2 = getOnesVector(ExtVT, DAG, DL);
14333 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14335 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
14336 // Since the i1 elements were sign extended we can use X86ISD::CVT2MASK.
14337 int NumElems = VT.getVectorNumElements();
14338 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14339 (Subtarget.hasDQI() && (NumElems < 32)))
14340 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
14342 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
14345 /// Helper function that returns true if the shuffle mask should be
14346 /// commuted to improve canonicalization.
14347 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14348 int NumElements = Mask.size();
14350 int NumV1Elements = 0, NumV2Elements = 0;
14354 else if (M < NumElements)
14359 // Commute the shuffle as needed such that more elements come from V1 than
14360 // V2. This allows us to match the shuffle pattern strictly on how many
14361 // elements come from V1 without handling the symmetric cases.
14362 if (NumV2Elements > NumV1Elements)
14365 assert(NumV1Elements > 0 && "No V1 indices");
14367 if (NumV2Elements == 0)
14370 // When the number of V1 and V2 elements is the same, try to minimize the
14371 // number of uses of V2 in the low half of the vector. When that is tied,
14372 // ensure that the sum of indices for V1 is equal to or lower than the sum of
14373 // indices for V2. When those are equal, try to ensure that the number of odd
14374 // indices for V1 is lower than the number of odd indices for V2.
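// For example, the v4 mask <4, 5, 0, 1> uses two elements from each input but
// both V2 elements sit in the low half, so it is commuted to <0, 1, 4, 5>.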
14375 if (NumV1Elements == NumV2Elements) {
14376 int LowV1Elements = 0, LowV2Elements = 0;
14377 for (int M : Mask.slice(0, NumElements / 2))
14378 if (M >= NumElements)
14382 if (LowV2Elements > LowV1Elements)
14384 if (LowV2Elements == LowV1Elements) {
14385 int SumV1Indices = 0, SumV2Indices = 0;
14386 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14387 if (Mask[i] >= NumElements)
14389 else if (Mask[i] >= 0)
14391 if (SumV2Indices < SumV1Indices)
14393 if (SumV2Indices == SumV1Indices) {
14394 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14395 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14396 if (Mask[i] >= NumElements)
14397 NumV2OddIndices += i % 2;
14398 else if (Mask[i] >= 0)
14399 NumV1OddIndices += i % 2;
14400 if (NumV2OddIndices < NumV1OddIndices)
14409 /// \brief Top-level lowering for x86 vector shuffles.
14411 /// This handles decomposition, canonicalization, and lowering of all x86
14412 /// vector shuffles. Most of the specific lowering strategies are encapsulated
14413 /// above in helper routines. The canonicalization attempts to widen shuffles
14414 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
14415 /// s.t. only one of the two inputs needs to be tested, etc.
14416 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
14417 SelectionDAG &DAG) {
14418 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
14419 ArrayRef<int> Mask = SVOp->getMask();
14420 SDValue V1 = Op.getOperand(0);
14421 SDValue V2 = Op.getOperand(1);
14422 MVT VT = Op.getSimpleValueType();
14423 int NumElements = VT.getVectorNumElements();
14425 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
14427 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
14428 "Can't lower MMX shuffles");
14430 bool V1IsUndef = V1.isUndef();
14431 bool V2IsUndef = V2.isUndef();
14432 if (V1IsUndef && V2IsUndef)
14433 return DAG.getUNDEF(VT);
14435 // When we create a shuffle node we put the UNDEF node as the second operand,
14436 // but in some cases the first operand may be transformed to UNDEF.
14437 // In that case we should just commute the node.
14439 return DAG.getCommutedVectorShuffle(*SVOp);
14441 // Check for non-undef masks pointing at an undef vector and make the masks
14442 // undef as well. This makes it easier to match the shuffle based solely on the mask.
14446 if (M >= NumElements) {
14447 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
14448 for (int &M : NewMask)
14449 if (M >= NumElements)
14451 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14454 // Check for illegal shuffle mask element index values.
14455 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
14456 assert(llvm::all_of(Mask,
14457 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
14458 "Out of bounds shuffle index");
14460 // We actually see shuffles that are entirely re-arrangements of a set of
14461 // zero inputs. This mostly happens while decomposing complex shuffles into
14462 // simple ones. Directly lower these as a buildvector of zeros.
14463 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
14464 if (Zeroable.isAllOnesValue())
14465 return getZeroVector(VT, Subtarget, DAG, DL);
14467 // Try to collapse shuffles into using a vector type with fewer elements but
14468 // wider element types. We cap this to not form integers or floating point
14469 // elements wider than 64 bits, but it might be interesting to form i128
14470 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
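// For example, a v4i32 shuffle whose mask pairs up adjacent elements, such as
// <2, 3, 0, 1>, can be re-lowered as a v2i64 shuffle with the mask <1, 0>.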
14471 SmallVector<int, 16> WidenedMask;
14472 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
14473 canWidenShuffleElements(Mask, WidenedMask)) {
14474 MVT NewEltVT = VT.isFloatingPoint()
14475 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
14476 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
14477 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14478 // Make sure that the new vector type is legal. For example, v2f64 isn't legal on SSE1.
14480 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14481 V1 = DAG.getBitcast(NewVT, V1);
14482 V2 = DAG.getBitcast(NewVT, V2);
14483 return DAG.getBitcast(
14484 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
14488 // Commute the shuffle if it will improve canonicalization.
14489 if (canonicalizeShuffleMaskWithCommute(Mask))
14490 return DAG.getCommutedVectorShuffle(*SVOp);
14492 // For each vector width, delegate to a specialized lowering routine.
14493 if (VT.is128BitVector())
14494 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14497 if (VT.is256BitVector())
14498 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14501 if (VT.is512BitVector())
14502 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14506 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
14508 llvm_unreachable("Unimplemented!");
14511 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
14512 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14513 const X86Subtarget &Subtarget,
14514 SelectionDAG &DAG) {
14515 SDValue Cond = Op.getOperand(0);
14516 SDValue LHS = Op.getOperand(1);
14517 SDValue RHS = Op.getOperand(2);
14519 MVT VT = Op.getSimpleValueType();
14521 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14523 auto *CondBV = cast<BuildVectorSDNode>(Cond);
14525 // Only non-legal VSELECTs reach this lowering; convert those into generic
14526 // shuffles and re-use the shuffle lowering path for blends.
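// Illustrative example of the mapping built below: for a v4 type, a constant
// condition of <true, false, true, false> becomes the shuffle mask
// <0, 5, 2, 7>, where indices >= 4 select from RHS and non-constant (undef)
// condition elements map to -1 (undef mask elements).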
14527 SmallVector<int, 32> Mask;
14528 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14529 SDValue CondElt = CondBV->getOperand(i);
14531 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
14534 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
14537 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14538 // A vselect where all conditions and data are constants can be optimized into
14539 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14540 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14541 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14542 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14545 // Try to lower this to a blend-style vector shuffle. This can handle all
14546 // constant condition cases.
14547 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14550 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
14551 // with patterns on the mask registers on AVX-512.
14552 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14555 // Variable blends are only legal from SSE4.1 onward.
14556 if (!Subtarget.hasSSE41())
14560 MVT VT = Op.getSimpleValueType();
14562 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14563 // into an i1 condition so that we can use the mask-based 512-bit blend instructions.
14565 if (VT.getSizeInBits() == 512) {
14566 SDValue Cond = Op.getOperand(0);
14567 // The vNi1 condition case should be handled above as it can be trivially lowered.
14569 assert(Cond.getValueType().getScalarSizeInBits() ==
14570 VT.getScalarSizeInBits() &&
14571 "Should have a size-matched integer condition!");
14572 // Build a mask by testing the condition against itself (tests for zero).
14573 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14574 SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14575 // Now return a new VSELECT using the mask.
14576 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14579 // Only some types will be legal on some subtargets. If we can emit a legal
14580 // VSELECT-matching blend, return Op; but if we need to expand, return a null value.
14582 switch (VT.SimpleTy) {
14584 // Most of the vector types have blends past SSE4.1.
14588 // The byte blends for AVX vectors were introduced only in AVX2.
14589 if (Subtarget.hasAVX2())
14596 // FIXME: We should custom lower this by fixing the condition and using i8
14602 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
14603 MVT VT = Op.getSimpleValueType();
14606 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14609 if (VT.getSizeInBits() == 8) {
14610 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14611 Op.getOperand(0), Op.getOperand(1));
14612 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
14615 if (VT == MVT::f32) {
14616 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14617 // the result back to FR32 register. It's only worth matching if the
14618 // result has a single use which is a store or a bitcast to i32. And in
14619 // the case of a store, it's not worth it if the index is a constant 0,
14620 // because a MOVSSmr can be used instead, which is smaller and faster.
14621 if (!Op.hasOneUse())
14623 SDNode *User = *Op.getNode()->use_begin();
14624 if ((User->getOpcode() != ISD::STORE ||
14625 isNullConstant(Op.getOperand(1))) &&
14626 (User->getOpcode() != ISD::BITCAST ||
14627 User->getValueType(0) != MVT::i32))
14629 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14630 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14632 return DAG.getBitcast(MVT::f32, Extract);
14635 if (VT == MVT::i32 || VT == MVT::i64) {
14636 // ExtractPS/pextrq works with constant index.
14637 if (isa<ConstantSDNode>(Op.getOperand(1)))
14644 /// Extract one bit from a mask vector, like v16i1 or v8i1.
14645 /// AVX-512 feature.
14646 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
14647 const X86Subtarget &Subtarget) {
14648 SDValue Vec = Op.getOperand(0);
14650 MVT VecVT = Vec.getSimpleValueType();
14651 SDValue Idx = Op.getOperand(1);
14652 MVT EltVT = Op.getSimpleValueType();
14654 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14655 "Unexpected vector type in ExtractBitFromMaskVector");
14657 // A variable index can't be handled in mask registers,
14658 // so extend the vector to VR512/VR128.
14659 if (!isa<ConstantSDNode>(Idx)) {
14660 unsigned NumElts = VecVT.getVectorNumElements();
14661 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
14662 // than extending to 128/256-bit.
14663 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
14664 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
14665 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
14666 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
14667 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14670 // Canonicalize result type to MVT::i32.
14671 if (EltVT != MVT::i32) {
14672 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14674 return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
14677 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14679 // Extracts from element 0 are always allowed.
14683 // If the kshift instructions of the correct width aren't natively supported
14684 // then we need to promote the vector to the native size to get the correct
14685 // zeroing behavior.
14686 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14687 (VecVT.getVectorNumElements() < 8)) {
14688 VecVT = MVT::v16i1;
14689 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14690 DAG.getUNDEF(VecVT),
14692 DAG.getIntPtrConstant(0, dl));
14695 // Use kshiftr instruction to move to the lower element.
14696 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14697 DAG.getConstant(IdxVal, dl, MVT::i8));
14698 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
14699 DAG.getIntPtrConstant(0, dl));
14703 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14704 SelectionDAG &DAG) const {
14706 SDValue Vec = Op.getOperand(0);
14707 MVT VecVT = Vec.getSimpleValueType();
14708 SDValue Idx = Op.getOperand(1);
14710 if (VecVT.getVectorElementType() == MVT::i1)
14711 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
14713 if (!isa<ConstantSDNode>(Idx)) {
14714 // It's more profitable to go through memory (1 cycle throughput)
14715 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
14716 // The IACA tool was used to get the performance estimate
14717 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14719 // example : extractelement <16 x i8> %a, i32 %i
14721 // Block Throughput: 3.00 Cycles
14722 // Throughput Bottleneck: Port5
14724 // | Num Of | Ports pressure in cycles | |
14725 // | Uops | 0 - DV | 5 | 6 | 7 | |
14726 // ---------------------------------------------
14727 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14728 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14729 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14730 // Total Num Of Uops: 4
14733 // Block Throughput: 1.00 Cycles
14734 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14736 // | | Ports pressure in cycles | |
14737 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14738 // ---------------------------------------------------------
14739 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14740 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14741 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
14742 // Total Num Of Uops: 4
14747 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14749 // If this is a 256-bit vector result, first extract the 128-bit vector and
14750 // then extract the element from the 128-bit vector.
14751 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14752 // Get the 128-bit vector.
14753 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14754 MVT EltVT = VecVT.getVectorElementType();
14756 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14757 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14759 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14760 // this can be done with a mask.
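// For example, extracting element 11 of a v16i32 first extracts the 128-bit
// chunk holding elements 8-11 and then reads element 11 & 3 == 3 of that chunk.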
14761 IdxVal &= ElemsPerChunk - 1;
14762 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14763 DAG.getConstant(IdxVal, dl, MVT::i32));
14766 assert(VecVT.is128BitVector() && "Unexpected vector length");
14768 MVT VT = Op.getSimpleValueType();
14770 if (VT.getSizeInBits() == 16) {
14771 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14772 // we're going to zero extend the register or fold the store (SSE41 only).
14773 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14774 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14775 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14776 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14777 DAG.getBitcast(MVT::v4i32, Vec), Idx));
14779 // Transform it so it matches pextrw which produces a 32-bit result.
14780 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14781 Op.getOperand(0), Op.getOperand(1));
14782 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
14785 if (Subtarget.hasSSE41())
14786 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14789 // TODO: We only extract a single element from v16i8, we can probably afford
14790 // to be more aggressive here before using the default approach of spilling to the stack.
14792 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14793 // Extract either the lowest i32 or any i16, and extract the sub-byte.
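// For example, extracting byte 3 reads dword 0 and shifts right by 24 bits,
// while extracting byte 6 reads word 3 directly (no shift needed).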
14794 int DWordIdx = IdxVal / 4;
14795 if (DWordIdx == 0) {
14796 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14797 DAG.getBitcast(MVT::v4i32, Vec),
14798 DAG.getIntPtrConstant(DWordIdx, dl));
14799 int ShiftVal = (IdxVal % 4) * 8;
14801 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14802 DAG.getConstant(ShiftVal, dl, MVT::i32));
14803 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14806 int WordIdx = IdxVal / 2;
14807 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14808 DAG.getBitcast(MVT::v8i16, Vec),
14809 DAG.getIntPtrConstant(WordIdx, dl));
14810 int ShiftVal = (IdxVal % 2) * 8;
14812 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14813 DAG.getConstant(ShiftVal, dl, MVT::i16));
14814 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14817 if (VT.getSizeInBits() == 32) {
14821 // SHUFPS the element to the lowest double word, then movss.
14822 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14823 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14824 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14825 DAG.getIntPtrConstant(0, dl));
14828 if (VT.getSizeInBits() == 64) {
14829 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14830 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14831 // to match extract_elt for f64.
14835 // UNPCKHPD the element to the lowest double word, then movsd.
14836 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14837 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14838 int Mask[2] = { 1, -1 };
14839 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14840 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14841 DAG.getIntPtrConstant(0, dl));
14847 /// Insert one bit into a mask vector, like v16i1 or v8i1.
14848 /// AVX-512 feature.
14849 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
14850 const X86Subtarget &Subtarget) {
14852 SDValue Vec = Op.getOperand(0);
14853 SDValue Elt = Op.getOperand(1);
14854 SDValue Idx = Op.getOperand(2);
14855 MVT VecVT = Vec.getSimpleValueType();
14857 if (!isa<ConstantSDNode>(Idx)) {
14858 // Non constant index. Extend source and destination,
14859 // insert element and then truncate the result.
14860 unsigned NumElts = VecVT.getVectorNumElements();
14861 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
14862 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
14863 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14864 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
14865 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
14866 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14869 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14870 unsigned NumElems = VecVT.getVectorNumElements();
14872 // If the kshift instructions of the correct width aren't natively supported
14873 // then we need to promote the vector to the native size to get the correct
14874 // zeroing behavior.
14875 if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
14876 // Need to promote to v16i1, do the insert, then extract back.
14877 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
14878 DAG.getUNDEF(MVT::v16i1), Vec,
14879 DAG.getIntPtrConstant(0, dl));
14880 Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
14881 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
14882 DAG.getIntPtrConstant(0, dl));
14885 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14887 if (Vec.isUndef()) {
14889 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14890 DAG.getConstant(IdxVal, dl, MVT::i8));
14894 // Insertion of one bit into the first position.
14895 if (IdxVal == 0) {
14896 // Clean top bits of vector.
14897 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14898 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14899 EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
14900 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14901 // Clean the first bit in source vector.
14902 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14903 DAG.getConstant(1, dl, MVT::i8));
14904 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14905 DAG.getConstant(1, dl, MVT::i8));
14907 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14909 // Insertion of one bit into last position
14910 if (IdxVal == NumElems - 1) {
14911 // Move the bit to the last position inside the vector.
14912 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14913 DAG.getConstant(IdxVal, dl, MVT::i8));
14914 // Clean the last bit in the source vector.
14915 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14916 DAG.getConstant(1, dl, MVT::i8));
14917 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14918 DAG.getConstant(1, dl, MVT::i8));
14920 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
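// General case sketch: shift the target bit of Vec down to bit 0, XOR it with
// the new bit, then shift that single XOR-result bit back up to position
// IdxVal (the two kshifts below zero out every other bit), and finally XOR
// with the original Vec. At position IdxVal this yields
// OldBit ^ (OldBit ^ NewBit) == NewBit, while all other bits stay unchanged.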
14923 // Move the current value of the bit to be replaced to bit 0.
14924 SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14925 DAG.getConstant(IdxVal, dl, MVT::i8));
14926 // Xor with the new bit.
14927 Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
14928 // Shift to MSB, filling bottom bits with 0.
14929 Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
14930 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14931 // Shift to the final position, filling upper bits with 0.
14932 Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
14933 DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
14934 // Xor with the original vector to cancel out the original bit value that's still in place, leaving the new bit.
14936 return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
14939 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14940 SelectionDAG &DAG) const {
14941 MVT VT = Op.getSimpleValueType();
14942 MVT EltVT = VT.getVectorElementType();
14943 unsigned NumElts = VT.getVectorNumElements();
14945 if (EltVT == MVT::i1)
14946 return InsertBitToMaskVector(Op, DAG, Subtarget);
14949 SDValue N0 = Op.getOperand(0);
14950 SDValue N1 = Op.getOperand(1);
14951 SDValue N2 = Op.getOperand(2);
14952 if (!isa<ConstantSDNode>(N2))
14954 auto *N2C = cast<ConstantSDNode>(N2);
14955 unsigned IdxVal = N2C->getZExtValue();
14957 bool IsZeroElt = X86::isZeroNode(N1);
14958 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
14960 // If we are inserting an element, see if we can do this more efficiently with
14961 // a blend shuffle with a rematerializable vector than a costly integer insertion.
14963 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
14964 16 <= EltVT.getSizeInBits()) {
14965 SmallVector<int, 8> BlendMask;
14966 for (unsigned i = 0; i != NumElts; ++i)
14967 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14968 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14969 : getOnesVector(VT, DAG, dl);
14970 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14973 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14974 // into that, and then insert the subvector back into the result.
14975 if (VT.is256BitVector() || VT.is512BitVector()) {
14976 // With a 256-bit vector, we can insert into the zero element efficiently
14977 // using a blend if we have AVX or AVX2 and the right data type.
14978 if (VT.is256BitVector() && IdxVal == 0) {
14979 // TODO: It is worthwhile to cast integer to floating point and back
14980 // and incur a domain crossing penalty if that's what we'll end up
14981 // doing anyway after extracting to a 128-bit vector.
14982 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14983 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14984 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14985 N2 = DAG.getIntPtrConstant(1, dl);
14986 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
14990 // Get the desired 128-bit vector chunk.
14991 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14993 // Insert the element into the desired chunk.
14994 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14995 assert(isPowerOf2_32(NumEltsIn128));
14996 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14997 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14999 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
15000 DAG.getConstant(IdxIn128, dl, MVT::i32));
15002 // Insert the changed part back into the bigger vector
15003 return insert128BitVector(N0, V, IdxVal, DAG, dl);
15005 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
15007 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
15008 // argument. SSE41 required for pinsrb.
15009 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
15011 if (VT == MVT::v8i16) {
15012 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
15013 Opc = X86ISD::PINSRW;
15015 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
15016 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
15017 Opc = X86ISD::PINSRB;
15020 if (N1.getValueType() != MVT::i32)
15021 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
15022 if (N2.getValueType() != MVT::i32)
15023 N2 = DAG.getIntPtrConstant(IdxVal, dl);
15024 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
15027 if (Subtarget.hasSSE41()) {
15028 if (EltVT == MVT::f32) {
15029 // Bits [7:6] of the constant are the source select. This will always be
15030 // zero here. The DAG Combiner may combine an extract_elt index into
15031 // these bits. For example (insert (extract, 3), 2) could be matched by
15032 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
15033 // Bits [5:4] of the constant are the destination select. This is the
15034 // value of the incoming immediate.
15035 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
15036 // combine either bitwise AND or insert of float 0.0 to set these bits.
15038 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
15039 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
15040 // If this is an insertion of 32-bits into the low 32-bits of
15041 // a vector, we prefer to generate a blend with immediate rather
15042 // than an insertps. Blends are simpler operations in hardware and so
15043 // will always have equal or better performance than insertps.
15044 // But if optimizing for size and there's a load folding opportunity,
15045 // generate insertps because blendps does not have a 32-bit memory operand.
15047 N2 = DAG.getIntPtrConstant(1, dl);
15048 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15049 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
15051 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
15052 // Create this as a scalar to vector.
15053 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15054 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
15057 // PINSR* works with constant index.
15058 if (EltVT == MVT::i32 || EltVT == MVT::i64)
15065 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
15066 SelectionDAG &DAG) {
15068 MVT OpVT = Op.getSimpleValueType();
15070 // It's always cheaper to replace a xor+movd with xorps, and it simplifies further combines.
15072 if (X86::isZeroNode(Op.getOperand(0)))
15073 return getZeroVector(OpVT, Subtarget, DAG, dl);
15075 // If this is a 256-bit vector result, first insert into a 128-bit
15076 // vector and then insert into the 256-bit vector.
15077 if (!OpVT.is128BitVector()) {
15078 // Insert into a 128-bit vector.
15079 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
15080 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
15081 OpVT.getVectorNumElements() / SizeFactor);
15083 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
15085 // Insert the 128-bit vector.
15086 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
15088 assert(OpVT.is128BitVector() && "Expected an SSE type!");
15090 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
15091 if (OpVT == MVT::v4i32)
15094 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
15095 return DAG.getBitcast(
15096 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
15099 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15100 // simple superregister reference or explicit instructions to insert
15101 // the upper bits of a vector.
15102 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15103 SelectionDAG &DAG) {
15104 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
15106 return insert1BitVector(Op, DAG, Subtarget);
15109 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15110 SelectionDAG &DAG) {
15111 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15112 "Only vXi1 extract_subvectors need custom lowering");
15115 SDValue Vec = Op.getOperand(0);
15116 SDValue Idx = Op.getOperand(1);
15118 if (!isa<ConstantSDNode>(Idx))
15121 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15122 if (IdxVal == 0) // the operation is legal
15125 MVT VecVT = Vec.getSimpleValueType();
15126 unsigned NumElems = VecVT.getVectorNumElements();
15128 // Extend to natively supported kshift.
15129 MVT WideVecVT = VecVT;
15130 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
15131 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
15132 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
15133 DAG.getUNDEF(WideVecVT), Vec,
15134 DAG.getIntPtrConstant(0, dl));
15137 // Shift to the LSB.
15138 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
15139 DAG.getConstant(IdxVal, dl, MVT::i8));
15141 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
15142 DAG.getIntPtrConstant(0, dl));
15145 // Returns the appropriate wrapper opcode for a global reference.
15146 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
15147 // References to absolute symbols are never PC-relative.
15148 if (GV && GV->isAbsoluteSymbolRef())
15149 return X86ISD::Wrapper;
15151 CodeModel::Model M = getTargetMachine().getCodeModel();
15152 if (Subtarget.isPICStyleRIPRel() &&
15153 (M == CodeModel::Small || M == CodeModel::Kernel))
15154 return X86ISD::WrapperRIP;
15156 return X86ISD::Wrapper;
15159 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
15160 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
15161 // one of the above mentioned nodes. It has to be wrapped because otherwise
15162 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
15163 // be used to form an addressing mode. These wrapped nodes will be selected into MOV32ri.
15166 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
15167 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
15169 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15170 // global base reg.
15171 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15173 auto PtrVT = getPointerTy(DAG.getDataLayout());
15174 SDValue Result = DAG.getTargetConstantPool(
15175 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
15177 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15178 // With PIC, the address is actually $g + Offset.
15181 DAG.getNode(ISD::ADD, DL, PtrVT,
15182 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15188 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
15189 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
15191 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15192 // global base reg.
15193 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15195 auto PtrVT = getPointerTy(DAG.getDataLayout());
15196 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
15198 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15200 // With PIC, the address is actually $g + Offset.
15203 DAG.getNode(ISD::ADD, DL, PtrVT,
15204 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15210 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15211 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15213 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15214 // global base reg.
15215 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
15216 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15218 auto PtrVT = getPointerTy(DAG.getDataLayout());
15219 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
15222 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15224 // With PIC, the address is actually $g + Offset.
15225 if (isPositionIndependent() && !Subtarget.is64Bit()) {
15227 DAG.getNode(ISD::ADD, DL, PtrVT,
15228 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15231 // For symbols that require a load from a stub to get the address, emit the load.
15233 if (isGlobalStubReference(OpFlag))
15234 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15235 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15241 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15242 // Create the TargetBlockAddressAddress node.
15243 unsigned char OpFlags =
15244 Subtarget.classifyBlockAddressReference();
15245 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15246 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15248 auto PtrVT = getPointerTy(DAG.getDataLayout());
15249 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15250 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15252 // With PIC, the address is actually $g + Offset.
15253 if (isGlobalRelativeToPICBase(OpFlags)) {
15254 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15255 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15261 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15262 const SDLoc &dl, int64_t Offset,
15263 SelectionDAG &DAG) const {
15264 // Create the TargetGlobalAddress node, folding in the constant
15265 // offset if it is legal.
15266 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15267 CodeModel::Model M = DAG.getTarget().getCodeModel();
15268 auto PtrVT = getPointerTy(DAG.getDataLayout());
15270 if (OpFlags == X86II::MO_NO_FLAG &&
15271 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15272 // A direct static reference to a global.
15273 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
15276 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15279 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
15281 // With PIC, the address is actually $g + Offset.
15282 if (isGlobalRelativeToPICBase(OpFlags)) {
15283 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15284 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15287 // For globals that require a load from a stub to get the address, emit the load.
15289 if (isGlobalStubReference(OpFlags))
15290 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15291 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15293 // If there was a non-zero offset that we didn't fold, create an explicit
15294 // addition for it.
15296 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15297 DAG.getConstant(Offset, dl, PtrVT));
15303 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15304 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15305 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15306 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
15310 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15311 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15312 unsigned char OperandFlags, bool LocalDynamic = false) {
15313 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15314 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15316 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15317 GA->getValueType(0),
15321 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
15325 SDValue Ops[] = { Chain, TGA, *InFlag };
15326 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15328 SDValue Ops[] = { Chain, TGA };
15329 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15332 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
15333 MFI.setAdjustsStack(true);
15334 MFI.setHasCalls(true);
15336 SDValue Flag = Chain.getValue(1);
15337 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15340 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
15342 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15345 SDLoc dl(GA); // ? function entry point might be better
15346 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15347 DAG.getNode(X86ISD::GlobalBaseReg,
15348 SDLoc(), PtrVT), InFlag);
15349 InFlag = Chain.getValue(1);
15351 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15354 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
15356 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15358 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15359 X86::RAX, X86II::MO_TLSGD);
15362 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15368 // Get the start address of the TLS block for this module.
15369 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15370 .getInfo<X86MachineFunctionInfo>();
15371 MFI->incNumLocalDynamicTLSAccesses();
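// Local-dynamic TLS needs only one __tls_get_addr call per function to fetch
// the module's TLS block base; each variable is then addressed as
// base + x@dtpoff (see the ADD at the end of this function).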
15375 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15376 X86II::MO_TLSLD, /*LocalDynamic=*/true);
15379 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15380 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15381 InFlag = Chain.getValue(1);
15382 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15383 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15386 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations of Base.
15390 unsigned char OperandFlags = X86II::MO_DTPOFF;
15391 unsigned WrapperKind = X86ISD::Wrapper;
15392 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15393 GA->getValueType(0),
15394 GA->getOffset(), OperandFlags);
15395 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15397 // Add x@dtpoff with the base.
15398 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15401 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15402 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15403 const EVT PtrVT, TLSModel::Model model,
15404 bool is64Bit, bool isPIC) {
15407 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15408 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15409 is64Bit ? 257 : 256));
15411 SDValue ThreadPointer =
15412 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15413 MachinePointerInfo(Ptr));
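// Note: address spaces 256/257 are how the x86 backend encodes %gs/%fs
// segment-relative accesses, so this load reads the thread pointer at
// %gs:0 (32-bit) or %fs:0 (64-bit).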
15415 unsigned char OperandFlags = 0;
15416 // Most TLS accesses are not RIP relative, even on x86-64. One exception is initial exec.
15418 unsigned WrapperKind = X86ISD::Wrapper;
15419 if (model == TLSModel::LocalExec) {
15420 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15421 } else if (model == TLSModel::InitialExec) {
15423 OperandFlags = X86II::MO_GOTTPOFF;
15424 WrapperKind = X86ISD::WrapperRIP;
15426 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
15429 llvm_unreachable("Unexpected model");
15432 // emit "addl x@ntpoff,%eax" (local exec)
15433 // or "addl x@indntpoff,%eax" (initial exec)
15434 // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
15436 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
15437 GA->getOffset(), OperandFlags);
15438 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15440 if (model == TLSModel::InitialExec) {
15441 if (isPIC && !is64Bit) {
15442 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
15443 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15447 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
15448 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15451 // The address of the thread local variable is the add of the thread
15452 // pointer with the offset of the variable.
15453 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
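// Top-level TLS lowering: pick the scheme based on the target (ELF, Darwin,
// Windows) and the TLS model chosen for the global.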
15457 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
15459 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
15461 if (DAG.getTarget().Options.EmulatedTLS)
15462 return LowerToTLSEmulatedModel(GA, DAG);
15464 const GlobalValue *GV = GA->getGlobal();
15465 auto PtrVT = getPointerTy(DAG.getDataLayout());
15466 bool PositionIndependent = isPositionIndependent();
15468 if (Subtarget.isTargetELF()) {
15469 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
15471 case TLSModel::GeneralDynamic:
15472 if (Subtarget.is64Bit())
15473 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
15474 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
15475 case TLSModel::LocalDynamic:
15476 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
15477 Subtarget.is64Bit());
15478 case TLSModel::InitialExec:
15479 case TLSModel::LocalExec:
15480 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
15481 PositionIndependent);
15483 llvm_unreachable("Unknown TLS model.");
15486 if (Subtarget.isTargetDarwin()) {
15487 // Darwin only has one model of TLS. Lower to that.
15488 unsigned char OpFlag = 0;
15489 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
15490 X86ISD::WrapperRIP : X86ISD::Wrapper;
15492 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15493 // global base reg.
15494 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
15496 OpFlag = X86II::MO_TLVP_PIC_BASE;
15498 OpFlag = X86II::MO_TLVP;
15500 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
15501 GA->getValueType(0),
15502 GA->getOffset(), OpFlag);
15503 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
15505 // With PIC32, the address is actually $g + Offset.
15507 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
15508 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15511 // Lowering the machine ISD will make sure everything is in the right place.
15513 SDValue Chain = DAG.getEntryNode();
15514 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15515 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
15516 SDValue Args[] = { Chain, Offset };
15517 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
15518 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
15519 DAG.getIntPtrConstant(0, DL, true),
15520 Chain.getValue(1), DL);
15522 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
15523 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15524 MFI.setAdjustsStack(true);
15526 // And our return value (the TLS address) is in the standard call return value location.
15528 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
15529 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
15532 if (Subtarget.isTargetKnownWindowsMSVC() ||
15533 Subtarget.isTargetWindowsItanium() ||
15534 Subtarget.isTargetWindowsGNU()) {
15535 // Just use the implicit TLS architecture
15536 // Need to generate something similar to:
15537 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
15539 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
15540 // mov rcx, qword [rdx+rcx*8]
15541 // mov eax, .tls$:tlsvar
15542 // [rax+rcx] contains the address
15543 // Windows 64bit: gs:0x58
15544 // Windows 32bit: fs:__tls_array
15547 SDValue Chain = DAG.getEntryNode();
15549 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15550 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15551 // use its literal value of 0x2C.
15552 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
15553 ? Type::getInt8PtrTy(*DAG.getContext(),
15555 : Type::getInt32PtrTy(*DAG.getContext(),
15558 SDValue TlsArray = Subtarget.is64Bit()
15559 ? DAG.getIntPtrConstant(0x58, dl)
15560 : (Subtarget.isTargetWindowsGNU()
15561 ? DAG.getIntPtrConstant(0x2C, dl)
15562 : DAG.getExternalSymbol("_tls_array", PtrVT));
15564 SDValue ThreadPointer =
15565 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
15568 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
15569 res = ThreadPointer;
15571 // Load the _tls_index variable
15572 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15573 if (Subtarget.is64Bit())
15574 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15575 MachinePointerInfo(), MVT::i32);
15577 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
15579 auto &DL = DAG.getDataLayout();
15581 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
15582 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
15584 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
15587 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
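// res now holds the base of this module's TLS block; the variable is then
// addressed as that base plus its @SECREL offset within the .tls section.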
15589 // Get the offset of the start of the .tls section.
15590 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15591 GA->getValueType(0),
15592 GA->getOffset(), X86II::MO_SECREL);
15593 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15595 // The address of the thread local variable is the add of the thread
15596 // pointer with the offset of the variable.
15597 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
15600 llvm_unreachable("TLS not implemented for this target.");
15603 /// Lower SRA_PARTS and friends, which return two i32 values
15604 /// and take a 2 x i32 value to shift plus a shift amount.
15605 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15606 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15607 MVT VT = Op.getSimpleValueType();
15608 unsigned VTBits = VT.getSizeInBits();
15610 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15611 SDValue ShOpLo = Op.getOperand(0);
15612 SDValue ShOpHi = Op.getOperand(1);
15613 SDValue ShAmt = Op.getOperand(2);
15614 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15615 // generic ISD nodes don't. Insert an AND to be safe; it's optimized away otherwise.
15617 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15618 DAG.getConstant(VTBits - 1, dl, MVT::i8));
15619 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15620 DAG.getConstant(VTBits - 1, dl, MVT::i8))
15621 : DAG.getConstant(0, dl, VT);
15623 SDValue Tmp2, Tmp3;
15624 if (Op.getOpcode() == ISD::SHL_PARTS) {
15625 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15626 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
15628 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15629 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15632 // If the shift amount is larger than or equal to the width of a part, we can't
15633 // rely on the results of shld/shrd. Insert a test and select the appropriate
15634 // values for large shift amounts.
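// e.g. for i64 SHL_PARTS on a 32-bit target with ShAmt = 40: the NE branch
// (ShAmt & 32 != 0) is taken, so Hi = Lo << (40 & 31) and Lo = 0.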
15635 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15636 DAG.getConstant(VTBits, dl, MVT::i8));
15637 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15638 AndNode, DAG.getConstant(0, dl, MVT::i8));
15641 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15642 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15643 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15645 if (Op.getOpcode() == ISD::SHL_PARTS) {
15646 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15647 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15649 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15650 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15653 SDValue Ops[2] = { Lo, Hi };
15654 return DAG.getMergeValues(Ops, dl);
15657 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15658 SelectionDAG &DAG) const {
15659 SDValue Src = Op.getOperand(0);
15660 MVT SrcVT = Src.getSimpleValueType();
15661 MVT VT = Op.getSimpleValueType();
15664 if (SrcVT.isVector()) {
15665 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
15666 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15667 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
15668 DAG.getUNDEF(SrcVT)));
15670 if (SrcVT == MVT::v2i1) {
15671 // For v2i1, we need to widen to v4i1 first.
15672 assert(VT == MVT::v2f64 && "Unexpected type");
15673 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src,
15674 DAG.getUNDEF(MVT::v2i1));
15675 return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(),
15676 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src));
15681 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15682 "Unknown SINT_TO_FP to lower!");
15684 // These are really Legal; return the operand so the caller accepts it as Legal.
15686 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
15688 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15689 Subtarget.is64Bit()) {
15693 SDValue ValueToStore = Op.getOperand(0);
15694 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15695 !Subtarget.is64Bit())
15696 // Bitcasting to f64 here allows us to do a single 64-bit store from
15697 // an SSE register, avoiding the store forwarding penalty that would come
15698 // with two 32-bit stores.
15699 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15701 unsigned Size = SrcVT.getSizeInBits()/8;
15702 MachineFunction &MF = DAG.getMachineFunction();
15703 auto PtrVT = getPointerTy(MF.getDataLayout());
15704 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15705 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15706 SDValue Chain = DAG.getStore(
15707 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15708 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15709 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
15712 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15714 SelectionDAG &DAG) const {
15718 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
15720 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15722 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15724 unsigned ByteSize = SrcVT.getSizeInBits()/8;
15726 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15727 MachineMemOperand *MMO;
15729 int SSFI = FI->getIndex();
15730 MMO = DAG.getMachineFunction().getMachineMemOperand(
15731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15732 MachineMemOperand::MOLoad, ByteSize, ByteSize);
15734 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15735 StackSlot = StackSlot.getOperand(1);
15737 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15738 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15740 Tys, Ops, SrcVT, MMO);
15743 Chain = Result.getValue(1);
15744 SDValue InFlag = Result.getValue(2);
15746 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15747 // shouldn't be necessary except that RFP cannot be live across
15748 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15749 MachineFunction &MF = DAG.getMachineFunction();
15750 unsigned SSFISize = Op.getValueSizeInBits()/8;
15751 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15752 auto PtrVT = getPointerTy(MF.getDataLayout());
15753 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15754 Tys = DAG.getVTList(MVT::Other);
15756 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15758 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15759 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15760 MachineMemOperand::MOStore, SSFISize, SSFISize);
15762 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15763 Ops, Op.getValueType(), MMO);
15764 Result = DAG.getLoad(
15765 Op.getValueType(), DL, Chain, StackSlot,
15766 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15772 /// 64-bit unsigned integer to double expansion.
15773 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
15774 const X86Subtarget &Subtarget) {
15775 // This algorithm is not obvious. Here is what we're trying to output:
15778 //   punpckldq  (c0), %xmm0   // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15779 //   subpd      (c1), %xmm0   // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15781 //   haddpd     %xmm0, %xmm0                         // with SSE3
15783 //   pshufd     $0x4e, %xmm0, %xmm1; addpd %xmm1, %xmm0   // without SSE3
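// The trick: punpckldq pairs each 32-bit half of the input with an exponent
// word, producing the doubles (2^52 + lo) and (2^84 + hi*2^32) bit-for-bit.
// Subtracting { 2^52, 2^84 } recovers lo and hi*2^32 exactly, and their sum
// (the horizontal add below) is the original unsigned 64-bit value.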
15789 LLVMContext *Context = DAG.getContext();
15791 // Build some magic constants.
15792 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15793 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15794 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
15795 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15797 SmallVector<Constant*,2> CV1;
15799 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15800 APInt(64, 0x4330000000000000ULL))));
15802 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15803 APInt(64, 0x4530000000000000ULL))));
15804 Constant *C1 = ConstantVector::get(CV1);
15805 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15807 // Load the 64-bit value into an XMM register.
15808 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15811 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15812 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15813 /* Alignment = */ 16);
15815 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15818 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15819 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15820 /* Alignment = */ 16);
15821 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15822 // TODO: Are there any fast-math-flags to propagate here?
15823 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15826 if (Subtarget.hasSSE3()) {
15827 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15828 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15830 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15831 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15832 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15833 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15836 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15837 DAG.getIntPtrConstant(0, dl));
15840 /// 32-bit unsigned integer to float expansion.
15841 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
15842 const X86Subtarget &Subtarget) {
15844 // FP constant to bias correct the final result.
15845 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15848 // Load the 32-bit value into an XMM register.
15849 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15852 // Zero out the upper parts of the register.
15853 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15855 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15856 DAG.getBitcast(MVT::v2f64, Load),
15857 DAG.getIntPtrConstant(0, dl));
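// Classic bias trick: OR the 32-bit value into the mantissa of 2^52
// (0x4330000000000000), so the resulting double equals 2^52 + x; subtracting
// the bias below leaves exactly x.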
15859 // Or the load with the bias.
15860 SDValue Or = DAG.getNode(
15861 ISD::OR, dl, MVT::v2i64,
15862 DAG.getBitcast(MVT::v2i64,
15863 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15864 DAG.getBitcast(MVT::v2i64,
15865 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15867 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15868 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15870 // Subtract the bias.
15871 // TODO: Are there any fast-math-flags to propagate here?
15872 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15874 // Handle final rounding.
15875 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
15878 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15879 const X86Subtarget &Subtarget, SDLoc &DL) {
15880 if (Op.getSimpleValueType() != MVT::v2f64)
15883 SDValue N0 = Op.getOperand(0);
15884 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15886 // Legalize to v4i32 type.
15887 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15888 DAG.getUNDEF(MVT::v2i32));
15890 if (Subtarget.hasAVX512())
15891 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15893 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15894 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
15895 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15896 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15898 // Two to the power of half-word-size.
15899 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15901 // Clear upper part of LO, lower HI.
15902 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15903 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
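// v = HI * 2^16 + LO, and both halves fit in 16 bits, so the signed
// v4i32 -> v2f64 conversions below are exact.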
15905 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15906 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15907 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15909 // Add the two halves.
15910 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
15913 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15914 const X86Subtarget &Subtarget) {
15915 // The algorithm is the following:
15916 // #ifdef __SSE4_1__
15917 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15918 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15919 // (uint4) 0x53000000, 0xaa);
15921 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15922 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15924 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15925 // return (float4) lo + fhi;
15927 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15928 // reassociate the two FADDs, and if we do that, the algorithm fails
15929 // spectacularly (PR24512).
15930 // FIXME: If we ever have some kind of Machine FMF, this should be marked
15931 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15932 // there's also the MachineCombiner reassociations happening on Machine IR.
15933 if (DAG.getTarget().Options.UnsafeFPMath)
15937 SDValue V = Op->getOperand(0);
15938 MVT VecIntVT = V.getSimpleValueType();
15939 bool Is128 = VecIntVT == MVT::v4i32;
15940 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
15941 // If we convert to something other than the supported type, e.g., to v4f64, bail out early.
15943 if (VecFloatVT != Op->getSimpleValueType(0))
15946 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15947 "Unsupported custom type");
15949 // In the #ifdef/#else code, we have in common:
15950 // - The vector of constants 0x4b000000 and 0x53000000, and the 16-bit right shift of v.
15956 // Create the splat vector for 0x4b000000.
15957 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15958 // Create the splat vector for 0x53000000.
15959 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15961 // Create the right shift.
15962 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15963 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
15966 if (Subtarget.hasSSE41()) {
15967 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15968 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15969 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15970 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
15971 // Low will be bitcasted right away, so do not bother bitcasting back to its original type.
15973 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15974 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15975 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15976 // (uint4) 0x53000000, 0xaa);
15977 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15978 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15979 // High will be bitcasted right away, so do not bother bitcasting back to
15980 // its original type.
15981 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15982 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15984 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15985 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15986 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15987 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15989 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15990 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
15993 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15994 SDValue VecCstFAdd = DAG.getConstantFP(
15995 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
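// 0xD3000080 is the IEEE single -(2^39 + 2^23): 'hi' was ORed with 0x53000000
// so as a float it reads 2^39 + hi*2^16, and 'lo' was ORed with 0x4b000000 so
// it reads 2^23 + lo. Adding this constant to hi and then adding lo cancels
// both biases, leaving hi*2^16 + lo == v.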
15997 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15998 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15999 // TODO: Are there any fast-math-flags to propagate here?
16001 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
16002 // return (float4) lo + fhi;
16003 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
16004 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
16007 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
16008 const X86Subtarget &Subtarget) {
16009 SDValue N0 = Op.getOperand(0);
16010 MVT SrcVT = N0.getSimpleValueType();
16013 if (SrcVT == MVT::v2i1) {
16014 // For v2i1, we need to widen to v4i1 first.
16015 assert(Op.getValueType() == MVT::v2f64 && "Unexpected type");
16016 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0,
16017 DAG.getUNDEF(MVT::v2i1));
16018 return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64,
16019 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0));
16022 switch (SrcVT.SimpleTy) {
16024 llvm_unreachable("Custom UINT_TO_FP is not supported!");
16026 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
16029 assert(!Subtarget.hasAVX512());
16030 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
16034 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
16035 SelectionDAG &DAG) const {
16036 SDValue N0 = Op.getOperand(0);
16038 auto PtrVT = getPointerTy(DAG.getDataLayout());
16040 if (Op.getSimpleValueType().isVector())
16041 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
16043 MVT SrcVT = N0.getSimpleValueType();
16044 MVT DstVT = Op.getSimpleValueType();
16046 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
16047 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
16048 // Conversions from unsigned i32 to f32/f64 are legal,
16049 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
16053 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
16054 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
16055 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
16056 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
16057 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
16060 // Make a 64-bit buffer, and use it to build an FILD.
16061 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
16062 if (SrcVT == MVT::i32) {
16063 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
16064 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
16065 StackSlot, MachinePointerInfo());
16066 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
16067 OffsetSlot, MachinePointerInfo());
16068 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
16072 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
16073 SDValue ValueToStore = Op.getOperand(0);
16074 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
16075 // Bitcasting to f64 here allows us to do a single 64-bit store from
16076 // an SSE register, avoiding the store forwarding penalty that would come
16077 // with two 32-bit stores.
16078 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16079 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16080 MachinePointerInfo());
16081 // For i64 source, we need to add the appropriate power of 2 if the input
16082 // was negative. This is the same as the optimization in
16083 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
16084 // we must be careful to do the computation in x87 extended precision, not
16085 // in SSE. (The generic code can't know it's OK to do this, or how to.)
16086 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
16087 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16088 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16089 MachineMemOperand::MOLoad, 8, 8);
16091 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
16092 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
16093 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
16096 APInt FF(32, 0x5F800000ULL);
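// 0x5F800000 is 2^64 as an IEEE single. FILD treats the stored i64 as signed,
// so if the original unsigned value had its top bit set the result is off by
// exactly 2^64; the code below conditionally adds that fudge factor back.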
16098 // Check whether the sign bit is set.
16099 SDValue SignSet = DAG.getSetCC(
16100 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
16101 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
16103 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
16104 SDValue FudgePtr = DAG.getConstantPool(
16105 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
16107 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
16108 SDValue Zero = DAG.getIntPtrConstant(0, dl);
16109 SDValue Four = DAG.getIntPtrConstant(4, dl);
16110 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
16111 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
16113 // Load the value out, extending it from f32 to f80.
16114 // FIXME: Avoid the extend by constructing the right constant pool?
16115 SDValue Fudge = DAG.getExtLoad(
16116 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
16117 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
16118 /* Alignment = */ 4);
16119 // Extend everything to 80 bits to force it to be done on x87.
16120 // TODO: Are there any fast-math-flags to propagate here?
16121 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
16122 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
16123 DAG.getIntPtrConstant(0, dl));
16126 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
16127 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
16128 // just return an <SDValue(), SDValue()> pair.
16129 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
16130 // to i16, i32 or i64, and we lower it to a legal sequence.
16131 // If lowered to the final integer result we return a <result, SDValue()> pair.
16132 // Otherwise we lower it to a sequence ending with a FIST, return a
16133 // <FIST, StackSlot> pair, and the caller is responsible for loading
16134 // the final integer result from StackSlot.
16135 std::pair<SDValue,SDValue>
16136 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
16137 bool IsSigned, bool IsReplace) const {
16140 EVT DstTy = Op.getValueType();
16141 EVT TheVT = Op.getOperand(0).getValueType();
16142 auto PtrVT = getPointerTy(DAG.getDataLayout());
16144 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
16145 // f16 must be promoted before using the lowering in this routine.
16146 // fp128 does not use this lowering.
16147 return std::make_pair(SDValue(), SDValue());
16150 // If using FIST to compute an unsigned i64, we'll need some fixup
16151 // to handle values above the maximum signed i64. A FIST is always
16152 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
16153 bool UnsignedFixup = !IsSigned &&
16154 DstTy == MVT::i64 &&
16155 (!Subtarget.is64Bit() ||
16156 !isScalarFPTypeInSSEReg(TheVT));
16158 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
16159 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
16160 // The low 32 bits of the fist result will have the correct uint32 result.
16161 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
16165 assert(DstTy.getSimpleVT() <= MVT::i64 &&
16166 DstTy.getSimpleVT() >= MVT::i16 &&
16167 "Unknown FP_TO_INT to lower!");
16169 // These are really Legal.
16170 if (DstTy == MVT::i32 &&
16171 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16172 return std::make_pair(SDValue(), SDValue());
16173 if (Subtarget.is64Bit() &&
16174 DstTy == MVT::i64 &&
16175 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16176 return std::make_pair(SDValue(), SDValue());
16178 // We lower FP->int64 into FISTP64 followed by a load from a temporary stack slot.
16180 MachineFunction &MF = DAG.getMachineFunction();
16181 unsigned MemSize = DstTy.getSizeInBits()/8;
16182 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16183 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16186 switch (DstTy.getSimpleVT().SimpleTy) {
16187 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
16188 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
16189 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
16190 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
16193 SDValue Chain = DAG.getEntryNode();
16194 SDValue Value = Op.getOperand(0);
16195 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16197 if (UnsignedFixup) {
16199 // Conversion to unsigned i64 is implemented with a select,
16200 // depending on whether the source value fits in the range
16201 // of a signed i64. Let Thresh be the FP equivalent of
16202 // 0x8000000000000000ULL.
16204 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16205 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16206 // Fist-to-mem64 FistSrc
16207 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16208 // to XOR'ing the high 32 bits with Adjust.
16210 // Being a power of 2, Thresh is exactly representable in all FP formats.
16211 // For X87 we'd like to use the smallest FP type for this constant, but
16212 // for DAG type consistency we have to match the FP operand type.
16214 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
16215 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16216 bool LosesInfo = false;
16217 if (TheVT == MVT::f64)
16218 // The rounding mode is irrelevant as the conversion should be exact.
16219 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16221 else if (TheVT == MVT::f80)
16222 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16223 APFloat::rmNearestTiesToEven, &LosesInfo);
16225 assert(Status == APFloat::opOK && !LosesInfo &&
16226 "FP conversion should have been exact");
16228 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16230 SDValue Cmp = DAG.getSetCC(DL,
16231 getSetCCResultType(DAG.getDataLayout(),
16232 *DAG.getContext(), TheVT),
16233 Value, ThreshVal, ISD::SETLT);
16234 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16235 DAG.getConstant(0, DL, MVT::i32),
16236 DAG.getConstant(0x80000000, DL, MVT::i32));
16237 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
16238 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16239 *DAG.getContext(), TheVT),
16240 Value, ThreshVal, ISD::SETLT);
16241 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16244 // FIXME This causes a redundant load/store if the SSE-class value is already
16245 // in memory, such as if it is on the callstack.
16246 if (isScalarFPTypeInSSEReg(TheVT)) {
16247 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16248 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16249 MachinePointerInfo::getFixedStack(MF, SSFI));
16250 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16252 Chain, StackSlot, DAG.getValueType(TheVT)
16255 MachineMemOperand *MMO =
16256 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16257 MachineMemOperand::MOLoad, MemSize, MemSize);
16258 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16259 Chain = Value.getValue(1);
16260 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16261 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16264 MachineMemOperand *MMO =
16265 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16266 MachineMemOperand::MOStore, MemSize, MemSize);
16268 if (UnsignedFixup) {
16270 // Insert the FIST, load its result as two i32's,
16271 // and XOR the high i32 with Adjust.
16273 SDValue FistOps[] = { Chain, Value, StackSlot };
16274 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16275 FistOps, DstTy, MMO);
16278 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16279 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16282 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16283 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16285 if (Subtarget.is64Bit()) {
16286 // Join High32 and Low32 into a 64-bit result.
16287 // (High32 << 32) | Low32
16288 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16289 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16290 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16291 DAG.getConstant(32, DL, MVT::i8));
16292 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16293 return std::make_pair(Result, SDValue());
16296 SDValue ResultOps[] = { Low32, High32 };
16298 SDValue pair = IsReplace
16299 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16300 : DAG.getMergeValues(ResultOps, DL);
16301 return std::make_pair(pair, SDValue());
16303 // Build the FP_TO_INT*_IN_MEM
16304 SDValue Ops[] = { Chain, Value, StackSlot };
16305 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16307 return std::make_pair(FIST, StackSlot);
16311 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16312 const X86Subtarget &Subtarget) {
16313 MVT VT = Op->getSimpleValueType(0);
16314 SDValue In = Op->getOperand(0);
16315 MVT InVT = In.getSimpleValueType();
16318 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16319 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16320 (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
16321 (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
16322 (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
16323 (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
16324 (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
16325 (VT != MVT::v32i16 || InVT != MVT::v32i8))
16328 if (Subtarget.hasInt256())
16329 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16331 // Optimize vectors in AVX mode:
16334 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16335 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16336 // Concat upper and lower parts.
16339 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16340 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16341 // Concat upper and lower parts.
16344 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16345 SDValue Undef = DAG.getUNDEF(InVT);
16346 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16347 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16348 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16350 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16351 VT.getVectorNumElements()/2);
16353 OpLo = DAG.getBitcast(HVT, OpLo);
16354 OpHi = DAG.getBitcast(HVT, OpHi);
16356 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16359 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
16360 const X86Subtarget &Subtarget,
16361 SelectionDAG &DAG) {
16362 MVT VT = Op->getSimpleValueType(0);
16363 SDValue In = Op->getOperand(0);
16364 MVT InVT = In.getSimpleValueType();
16365 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
16367 unsigned NumElts = VT.getVectorNumElements();
16369 // Extend VT to i32 elements if the scalar type is i8/i16 and BWI is not supported.
16371 if (!Subtarget.hasBWI() &&
16372 (VT.getVectorElementType().getSizeInBits() <= 16))
16373 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
16375 // Widen to 512-bits if VLX is not supported.
16376 MVT WideVT = ExtVT;
16377 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
16378 NumElts *= 512 / ExtVT.getSizeInBits();
16379 InVT = MVT::getVectorVT(MVT::i1, NumElts);
16380 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
16381 In, DAG.getIntPtrConstant(0, DL));
16382 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
16386 SDValue One = DAG.getConstant(1, DL, WideVT);
16387 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
16389 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
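// A zero-extend of an i1 mask vector is just a per-lane select of 1 or 0
// keyed on the mask register.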
16391 // Truncate if we had to extend i16/i8 above.
16393 WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16394 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
16397 // Extract back to 128/256-bit if we widened.
16399 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
16400 DAG.getIntPtrConstant(0, DL));
16402 return SelectedVal;
16405 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16406 SelectionDAG &DAG) {
16407 SDValue In = Op.getOperand(0);
16408 MVT SVT = In.getSimpleValueType();
16410 if (SVT.getVectorElementType() == MVT::i1)
16411 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
16413 if (Subtarget.hasFp256())
16414 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
16417 assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
16418 Op.getSimpleValueType().getVectorNumElements() !=
16419 SVT.getVectorNumElements());
16423 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
16424 /// It makes use of the fact that vectors with enough leading sign/zero bits
16425 /// prevent the PACKSS/PACKUS from saturating the results.
16426 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
16427 /// within each 128-bit lane.
16428 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
16429 const SDLoc &DL, SelectionDAG &DAG,
16430 const X86Subtarget &Subtarget) {
16431 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
16432 "Unexpected PACK opcode");
16434 // Requires SSE2 but AVX512 has fast truncate.
16435 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
16438 EVT SrcVT = In.getValueType();
16440 // No truncation required, we might get here due to recursive calls.
16441 if (SrcVT == DstVT)
16444 // We only support vector truncation to 128 bits or greater from a
16445 // 256-bit or greater source.
16446 unsigned DstSizeInBits = DstVT.getSizeInBits();
16447 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
16448 if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
16451 LLVMContext &Ctx = *DAG.getContext();
16452 unsigned NumElems = SrcVT.getVectorNumElements();
16453 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
16454 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
16456 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
16458 // Extract lower/upper subvectors.
16459 unsigned NumSubElts = NumElems / 2;
16460 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16461 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16463 // Pack to the largest type possible:
16464 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
16465 EVT InVT = MVT::i16, OutVT = MVT::i8;
16466 if (DstVT.getScalarSizeInBits() > 8 &&
16467 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
16472 unsigned SubSizeInBits = SrcSizeInBits / 2;
16473 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
16474 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
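// e.g. a v8i32 -> v8i16 truncation extracts two v4i32 halves and emits a
// single PACKSSDW/PACKUSDW, which (absent saturation) is exactly the
// element-wise truncate.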
16476 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
16477 if (SrcVT.is256BitVector()) {
16478 Lo = DAG.getBitcast(InVT, Lo);
16479 Hi = DAG.getBitcast(InVT, Hi);
16480 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16481 return DAG.getBitcast(DstVT, Res);
16484 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
16485 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
16486 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
16487 Lo = DAG.getBitcast(InVT, Lo);
16488 Hi = DAG.getBitcast(InVT, Hi);
16489 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16491 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
16492 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
16493 Res = DAG.getBitcast(MVT::v4i64, Res);
16494 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
16496 if (DstVT.is256BitVector())
16497 return DAG.getBitcast(DstVT, Res);
16499 // If 512bit -> 128bit truncate another stage.
16500 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16501 Res = DAG.getBitcast(PackedVT, Res);
16502 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
16505 // Recursively pack lower/upper subvectors, concat result and pack again.
16506 assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
16507 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
16508 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
16509 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
16511 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16512 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
16513 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
16516 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
16517 const X86Subtarget &Subtarget) {
16520 MVT VT = Op.getSimpleValueType();
16521 SDValue In = Op.getOperand(0);
16522 MVT InVT = In.getSimpleValueType();
16524 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
16526 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
16527 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
16528 if (InVT.getScalarSizeInBits() <= 16) {
16529 if (Subtarget.hasBWI()) {
16530 // legal, will go to VPMOVB2M, VPMOVW2M
16531 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
16532 // We need to shift to get the lsb into sign position.
16533 // Shifting packed bytes is not supported natively, so bitcast to words.
16534 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
16535 In = DAG.getNode(ISD::SHL, DL, ExtVT,
16536 DAG.getBitcast(ExtVT, In),
16537 DAG.getConstant(ShiftInx, DL, ExtVT));
16538 In = DAG.getBitcast(InVT, In);
16540 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, In);
16542 // Use TESTD/Q: extend the vector to packed dword/qword elements.
16543 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
16544 "Unexpected vector type.");
16545 unsigned NumElts = InVT.getVectorNumElements();
16546 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
16547 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
16548 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
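// Without BWI there is no VPMOVB2M/VPMOVW2M, so sign-extend to dword/qword
// elements and let the TESTM below rebuild the i1 mask.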
16550 ShiftInx = InVT.getScalarSizeInBits() - 1;
16553 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
16554 // We need to shift to get the lsb into sign position.
16555 In = DAG.getNode(ISD::SHL, DL, InVT, In,
16556 DAG.getConstant(ShiftInx, DL, InVT));
16558 return DAG.getNode(X86ISD::TESTM, DL, VT, In, In);
16561 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
16563 MVT VT = Op.getSimpleValueType();
16564 SDValue In = Op.getOperand(0);
16565 MVT InVT = In.getSimpleValueType();
16566 unsigned InNumEltBits = InVT.getScalarSizeInBits();
16568 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16569 "Invalid TRUNCATE operation");
16571 if (VT.getVectorElementType() == MVT::i1)
16572 return LowerTruncateVecI1(Op, DAG, Subtarget);
16574 // vpmovqb/w/d, vpmovdb/w, vpmovwb
16575 if (Subtarget.hasAVX512()) {
16576 // word to byte only under BWI
16577 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
16578 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
16579 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
16580 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
16583 // Truncate with PACKSS if we are truncating a vector with sign-bits that
16584 // extend all the way to the packed/truncated value.
16585 unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
16586 if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
16588 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
16591 // Truncate with PACKUS if we are truncating a vector with leading zero bits
16592 // that extend all the way to the packed/truncated value.
16593 // Pre-SSE41 we can only use PACKUSWB.
16595 DAG.computeKnownBits(In, Known);
16596 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
16597 if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
16599 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
16602 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
16603 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
16604 if (Subtarget.hasInt256()) {
16605 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
16606 In = DAG.getBitcast(MVT::v8i32, In);
16607 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
16608 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
16609 DAG.getIntPtrConstant(0, DL));
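// Without AVX2, split the v4i64 into two 128-bit halves and shuffle the even
// dwords of each half together to form the v4i32 result.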
16612 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16613 DAG.getIntPtrConstant(0, DL));
16614 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16615 DAG.getIntPtrConstant(2, DL));
16616 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16617 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16618 static const int ShufMask[] = {0, 2, 4, 6};
16619 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
16622 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
16623 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
16624 if (Subtarget.hasInt256()) {
16625 In = DAG.getBitcast(MVT::v32i8, In);
16627 // The PSHUFB mask:
16628 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
16629 -1, -1, -1, -1, -1, -1, -1, -1,
16630 16, 17, 20, 21, 24, 25, 28, 29,
16631 -1, -1, -1, -1, -1, -1, -1, -1 };
16632 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
16633 In = DAG.getBitcast(MVT::v4i64, In);
16635 static const int ShufMask2[] = {0, 2, -1, -1};
16636 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
16637 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16638 DAG.getIntPtrConstant(0, DL));
16639 return DAG.getBitcast(VT, In);
16642 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16643 DAG.getIntPtrConstant(0, DL));
16645 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16646 DAG.getIntPtrConstant(4, DL));
16648 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
16649 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
16651 // The PSHUFB mask:
16652 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
16653 -1, -1, -1, -1, -1, -1, -1, -1};
16655 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
16656 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
16658 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16659 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16661 // The MOVLHPS Mask:
16662 static const int ShufMask2[] = {0, 1, 4, 5};
16663 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
16664 return DAG.getBitcast(MVT::v8i16, res);
16667 // Handle truncation of V256 to V128 using shuffles.
16668 if (!VT.is128BitVector() || !InVT.is256BitVector())
16671 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
16673 unsigned NumElems = VT.getVectorNumElements();
16674 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
16676 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
16677 // Prepare truncation shuffle mask
16678 for (unsigned i = 0; i != NumElems; ++i)
16679 MaskVec[i] = i * 2;
16680 In = DAG.getBitcast(NVT, In);
16681 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
16682 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
16683 DAG.getIntPtrConstant(0, DL));
16686 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
16687 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
16688 MVT VT = Op.getSimpleValueType();
16690 if (VT.isVector()) {
16691 SDValue Src = Op.getOperand(0);
16694 if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
16695 MVT ResVT = MVT::v4i32;
16696 MVT TruncVT = MVT::v4i1;
16697 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
16698 if (!IsSigned && !Subtarget.hasVLX()) {
16699 // Widen to 512-bits.
16700 ResVT = MVT::v8i32;
16701 TruncVT = MVT::v8i1;
16702 Opc = ISD::FP_TO_UINT;
16703 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
16704 DAG.getUNDEF(MVT::v8f64),
16705 Src, DAG.getIntPtrConstant(0, dl));
16707 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
16708 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
16709 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
16710 DAG.getIntPtrConstant(0, dl));
16713 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
16714 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
16715 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
16716 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
16717 DAG.getUNDEF(MVT::v2f32)));
16723 assert(!VT.isVector());
16725 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
16726 IsSigned, /*IsReplace=*/ false);
16727 SDValue FIST = Vals.first, StackSlot = Vals.second;
16728 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16729 if (!FIST.getNode())
16732 if (StackSlot.getNode())
16733 // Load the result.
16734 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16736 // The node is the result.
16740 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16742 MVT VT = Op.getSimpleValueType();
16743 SDValue In = Op.getOperand(0);
16744 MVT SVT = In.getSimpleValueType();
16746 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16748 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16749 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16750 In, DAG.getUNDEF(SVT)));
16753 /// The only differences between FABS and FNEG are the mask and the logic op.
16754 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
16755 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16756 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16757 "Wrong opcode for lowering FABS or FNEG.");
16759 bool IsFABS = (Op.getOpcode() == ISD::FABS);
16761 // If this is a FABS and it has an FNEG user, bail out to fold the combination
16762 // into an FNABS. We'll lower the FABS after that if it is still in use.
16764 for (SDNode *User : Op->uses())
16765 if (User->getOpcode() == ISD::FNEG)
16769 MVT VT = Op.getSimpleValueType();
16771 bool IsF128 = (VT == MVT::f128);
16773 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16774 // decide if we should generate a 16-byte constant mask when we only need 4 or
16775 // 8 bytes for the scalar case.
16780 if (VT.isVector()) {
16782 EltVT = VT.getVectorElementType();
16783 } else if (IsF128) {
16784 // SSE instructions are used for optimized f128 logical operations.
16785 LogicVT = MVT::f128;
16788 // There are no scalar bitwise logical SSE/AVX instructions, so we
16789 // generate a 16-byte vector constant and logic op even for the scalar case.
16790 // Using a 16-byte mask allows folding the load of the mask with
16791 // the logic op, so it can save (~4 bytes) on code size.
16792 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16796 unsigned EltBits = EltVT.getSizeInBits();
16797 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16799 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16800 const fltSemantics &Sem =
16801 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16802 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16803 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16805 SDValue Op0 = Op.getOperand(0);
16806 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16808 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16809 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16811 if (VT.isVector() || IsF128)
16812 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16814 // For the scalar case extend to a 128-bit vector, perform the logic op,
16815 // and extract the scalar result back out.
16816 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16817 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16818 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16819 DAG.getIntPtrConstant(0, dl));
16822 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16823 SDValue Mag = Op.getOperand(0);
16824 SDValue Sign = Op.getOperand(1);
16827 // If the sign operand is smaller, extend it first.
16828 MVT VT = Op.getSimpleValueType();
16829 if (Sign.getSimpleValueType().bitsLT(VT))
16830 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16832 // And if it is bigger, shrink it first.
16833 if (Sign.getSimpleValueType().bitsGT(VT))
16834 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16836 // At this point the operands and the result should have the same
16837 // type, and that won't be f80 since that is not custom lowered.
16838 bool IsF128 = (VT == MVT::f128);
16839 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16840 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16841 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16842 "Unexpected type in LowerFCOPYSIGN");
16844 MVT EltVT = VT.getScalarType();
16845 const fltSemantics &Sem =
16846 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16847 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16849 // Perform all scalar logic operations as 16-byte vectors because there are no
16850 // scalar FP logic instructions in SSE.
16851 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16852 // unnecessary splats, but we might miss load folding opportunities. Should
16853 // this decision be based on OptimizeForSize?
16854 bool IsFakeVector = !VT.isVector() && !IsF128;
16857 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16859 // The mask constants are automatically splatted for vector types.
16860 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16861 SDValue SignMask = DAG.getConstantFP(
16862 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16863 SDValue MagMask = DAG.getConstantFP(
16864 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16866 // First, clear all bits but the sign bit from the second operand (sign).
16868 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16869 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16871 // Next, clear the sign bit from the first operand (magnitude).
16872 // TODO: If we had general constant folding for FP logic ops, this check
16873 // wouldn't be necessary.
16875 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16876 APFloat APF = Op0CN->getValueAPF();
16878 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16880 // If the magnitude operand wasn't a constant, we need to AND out the sign.
16882 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16883 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16886 // OR the magnitude value with the sign bit.
16887 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16888 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16889 DAG.getIntPtrConstant(0, dl));
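// A worked bit-level example of the FAND/FOR sequence above, assuming IEEE-754
// binary32 scalars (values chosen purely for illustration):
//   copysign(1.0f, -2.5f):
//     Mag     = 0x3F800000, Sign = 0xC0200000
//     SignBit = Sign & 0x80000000 = 0x80000000
//     MagBits = Mag  & 0x7FFFFFFF = 0x3F800000
//     Or      = 0xBF800000 = -1.0f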
16892 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16893 SDValue N0 = Op.getOperand(0);
16895 MVT VT = Op.getSimpleValueType();
16897 MVT OpVT = N0.getSimpleValueType();
16898 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16899 "Unexpected type for FGETSIGN");
16901 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16902 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16903 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16904 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16905 Res = DAG.getZExtOrTrunc(Res, dl, VT);
16906 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
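// E.g. for f32 this typically selects to (a sketch; register choice is
// illustrative):
//   movmskps %xmm0, %eax     ; bit i of %eax = sign bit of lane i
//   andl     $1, %eax        ; keep only lane 0's sign
// so any input with its sign bit set (including -0.0 and negative NaNs)
// yields 1, and everything else yields 0.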
16910 // Check whether an OR'd tree is PTEST-able.
16911 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16912 SelectionDAG &DAG) {
16913 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16915 if (!Subtarget.hasSSE41())
16918 if (!Op->hasOneUse())
16921 SDNode *N = Op.getNode();
16924 SmallVector<SDValue, 8> Opnds;
16925 DenseMap<SDValue, unsigned> VecInMap;
16926 SmallVector<SDValue, 8> VecIns;
16927 EVT VT = MVT::Other;
16929 // Recognize a special case where a vector is cast into a wide integer to test for all zeros.
16931 Opnds.push_back(N->getOperand(0));
16932 Opnds.push_back(N->getOperand(1));
16934 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16935 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16936 // BFS traverse all OR'd operands.
16937 if (I->getOpcode() == ISD::OR) {
16938 Opnds.push_back(I->getOperand(0));
16939 Opnds.push_back(I->getOperand(1));
16940 // Re-evaluate the number of nodes to be traversed.
16941 e += 2; // 2 more nodes (LHS and RHS) are pushed.
16945 // Quit if this is not an EXTRACT_VECTOR_ELT.
16946 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16949 // Quit if the index is not a constant.
16950 SDValue Idx = I->getOperand(1);
16951 if (!isa<ConstantSDNode>(Idx))
16954 SDValue ExtractedFromVec = I->getOperand(0);
16955 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16956 if (M == VecInMap.end()) {
16957 VT = ExtractedFromVec.getValueType();
16958 // Quit if not 128/256-bit vector.
16959 if (!VT.is128BitVector() && !VT.is256BitVector())
16961 // Quit if not the same type.
16962 if (VecInMap.begin() != VecInMap.end() &&
16963 VT != VecInMap.begin()->first.getValueType())
16965 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16966 VecIns.push_back(ExtractedFromVec);
16968 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16971 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16972 "Not extracted from 128-/256-bit vector.");
16974 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16976 for (DenseMap<SDValue, unsigned>::const_iterator
16977 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16978 // Quit if not all elements are used.
16979 if (I->second != FullMask)
16983 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16985 // Cast all vectors into TestVT for PTEST.
16986 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16987 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16989 // If more than one full vector is evaluated, OR them first before PTEST.
16990 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16991 // Each iteration will OR 2 nodes and append the result until there is only
16992 // 1 node left, i.e. the final OR'd value of all vectors.
16993 SDValue LHS = VecIns[Slot];
16994 SDValue RHS = VecIns[Slot + 1];
16995 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16998 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
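// For example (a sketch in IR-like form), an all-zeros test written as
//   %e0 = extractelement <4 x i32> %v, i32 0
//   %e1 = extractelement <4 x i32> %v, i32 1
//   %e2 = extractelement <4 x i32> %v, i32 2
//   %e3 = extractelement <4 x i32> %v, i32 3
//   (%e0 | %e1 | %e2 | %e3) == 0
// is matched here and becomes a single "ptest %xmm0, %xmm0", whose ZF is set
// exactly when every bit of %v is zero.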
17001 /// \brief Return true if \c Op has a use that doesn't just read flags.
17002 static bool hasNonFlagsUse(SDValue Op) {
17003 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
17005 SDNode *User = *UI;
17006 unsigned UOpNo = UI.getOperandNo();
17007 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
17008 // Look past the truncate.
17009 UOpNo = User->use_begin().getOperandNo();
17010 User = *User->use_begin();
17013 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
17014 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
17020 /// Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
17022 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
17023 SelectionDAG &DAG) const {
17024 if (Op.getValueType() == MVT::i1) {
17025 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
17026 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
17027 DAG.getConstant(0, dl, MVT::i8));
17029 // CF and OF aren't always set the way we want. Determine which
17030 // of these we need.
17031 bool NeedCF = false;
17032 bool NeedOF = false;
17035 case X86::COND_A: case X86::COND_AE:
17036 case X86::COND_B: case X86::COND_BE:
17039 case X86::COND_G: case X86::COND_GE:
17040 case X86::COND_L: case X86::COND_LE:
17041 case X86::COND_O: case X86::COND_NO: {
17042 // Check if we really need to set the
17043 // Overflow flag. If NoSignedWrap is present,
17044 // it is not actually needed.
17045 switch (Op->getOpcode()) {
17050 if (Op.getNode()->getFlags().hasNoSignedWrap())
17060 // See if we can use the EFLAGS value from the operand instead of
17061 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
17062 // we prove that the arithmetic won't overflow, we can't use OF or CF.
17063 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
17064 // Emit a CMP with 0, which is the TEST pattern.
17065 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17066 DAG.getConstant(0, dl, Op.getValueType()));
17068 unsigned Opcode = 0;
17069 unsigned NumOperands = 0;
17071 // Truncate operations may prevent the merge of the SETCC instruction
17072 // and the arithmetic instruction before it. Attempt to truncate the operands
17073 // of the arithmetic instruction and use a reduced bit-width instruction.
17074 bool NeedTruncation = false;
17075 SDValue ArithOp = Op;
17076 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
17077 SDValue Arith = Op->getOperand(0);
17078 // Both the trunc and the arithmetic op need to have one user each.
17079 if (Arith->hasOneUse())
17080 switch (Arith.getOpcode()) {
17087 NeedTruncation = true;
17093 // Sometimes flags can be set either with an AND or with an SRL/SHL instruction.
17094 // The SRL/SHL variant should be preferred for masks longer than this number of bits.
17096 const int ShiftToAndMaxMaskWidth = 32;
17097 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
17099 // NOTE: In the code below we use ArithOp to hold the arithmetic operation,
17100 // which may be the result of a CAST (truncate). When we check for possible
17101 // users we use the variable 'Op', which is the original, non-cast value.
17102 switch (ArithOp.getOpcode()) {
17104 // We only want to rewrite this as a target-specific node with attached
17105 // flags if there is a reasonable chance of either using that to do custom
17106 // instruction selection that can fold some of the memory operands, or if
17107 // only the flags are used. If there are other uses, leave the node alone
17108 // and emit a test instruction.
17109 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17110 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17111 if (UI->getOpcode() != ISD::CopyToReg &&
17112 UI->getOpcode() != ISD::SETCC &&
17113 UI->getOpcode() != ISD::STORE)
17116 if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
17117 // An add of one will be selected as an INC.
17119 (!Subtarget.slowIncDec() ||
17120 DAG.getMachineFunction().getFunction().optForSize())) {
17121 Opcode = X86ISD::INC;
17126 // An add of negative one (subtract of one) will be selected as a DEC.
17127 if (C->isAllOnesValue() &&
17128 (!Subtarget.slowIncDec() ||
17129 DAG.getMachineFunction().getFunction().optForSize())) {
17130 Opcode = X86ISD::DEC;
17136 // Otherwise use a regular EFLAGS-setting add.
17137 Opcode = X86ISD::ADD;
17142 // If we have a constant logical shift that's only used in a comparison
17143 // against zero, turn it into an equivalent AND. This allows turning it into
17144 // a TEST instruction later.
17145 if (ZeroCheck && Op->hasOneUse() &&
17146 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
17147 EVT VT = Op.getValueType();
17148 unsigned BitWidth = VT.getSizeInBits();
17149 unsigned ShAmt = Op->getConstantOperandVal(1);
17150 if (ShAmt >= BitWidth) // Avoid undefined shifts.
17152 APInt Mask = ArithOp.getOpcode() == ISD::SRL
17153 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
17154 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
17155 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17157 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
17158 DAG.getConstant(Mask, dl, VT));
17163 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
17164 // because a TEST instruction will be better. However, AND should be
17165 // preferred if the instruction can be combined into ANDN.
17166 if (!hasNonFlagsUse(Op)) {
17167 SDValue Op0 = ArithOp->getOperand(0);
17168 SDValue Op1 = ArithOp->getOperand(1);
17169 EVT VT = ArithOp.getValueType();
17170 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
17171 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
17172 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
17174 // If we cannot select an ANDN instruction, check if we can replace
17175 // AND+IMM64 with a shift before giving up. This is possible for masks
17176 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
17177 if (!isProperAndn) {
17181 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
17182 auto *CN = dyn_cast<ConstantSDNode>(Op1);
17186 const APInt &Mask = CN->getAPIntValue();
17187 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17188 break; // Prefer TEST instruction.
17190 unsigned BitWidth = Mask.getBitWidth();
17191 unsigned LeadingOnes = Mask.countLeadingOnes();
17192 unsigned TrailingZeros = Mask.countTrailingZeros();
17194 if (LeadingOnes + TrailingZeros == BitWidth) {
17195 assert(TrailingZeros < VT.getSizeInBits() &&
17196 "Shift amount should be less than the type width");
17197 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17198 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
17199 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
17203 unsigned LeadingZeros = Mask.countLeadingZeros();
17204 unsigned TrailingOnes = Mask.countTrailingOnes();
17206 if (LeadingZeros + TrailingOnes == BitWidth) {
17207 assert(LeadingZeros < VT.getSizeInBits() &&
17208 "Shift amount should be less than the type width");
17209 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17210 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
17211 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
17222 // Similar to ISD::ADD above, check if the uses will preclude useful
17223 // lowering of the target-specific node.
17224 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17225 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17226 if (UI->getOpcode() != ISD::CopyToReg &&
17227 UI->getOpcode() != ISD::SETCC &&
17228 UI->getOpcode() != ISD::STORE)
17231 // Otherwise use a regular EFLAGS-setting instruction.
17232 switch (ArithOp.getOpcode()) {
17233 default: llvm_unreachable("unexpected operator!");
17234 case ISD::SUB: Opcode = X86ISD::SUB; break;
17235 case ISD::XOR: Opcode = X86ISD::XOR; break;
17236 case ISD::AND: Opcode = X86ISD::AND; break;
17238 if (!NeedTruncation && ZeroCheck) {
17239 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
17242 Opcode = X86ISD::OR;
17256 return SDValue(Op.getNode(), 1);
17262 // If we found that truncation is beneficial, perform the truncation and use it.
17264 if (NeedTruncation) {
17265 EVT VT = Op.getValueType();
17266 SDValue WideVal = Op->getOperand(0);
17267 EVT WideVT = WideVal.getValueType();
17268 unsigned ConvertedOp = 0;
17269 // Use a target machine opcode to prevent further DAGCombine
17270 // optimizations that may separate the arithmetic operations
17271 // from the setcc node.
17272 switch (WideVal.getOpcode()) {
17274 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
17275 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
17276 case ISD::AND: ConvertedOp = X86ISD::AND; break;
17277 case ISD::OR: ConvertedOp = X86ISD::OR; break;
17278 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
17282 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17283 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
17284 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
17285 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
17286 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
17292 // Emit a CMP with 0, which is the TEST pattern.
17293 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17294 DAG.getConstant(0, dl, Op.getValueType()));
17296 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17297 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
17299 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
17300 DAG.ReplaceAllUsesWith(Op, New);
17301 return SDValue(New.getNode(), 1);
17304 /// Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
17306 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17307 const SDLoc &dl, SelectionDAG &DAG) const {
17308 if (isNullConstant(Op1))
17309 return EmitTest(Op0, X86CC, dl, DAG);
17311 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17312 "Unexpected comparison operation for MVT::i1 operands");
17314 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17315 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17316 // Only promote the compare up to i32 if it is a 16-bit operation
17317 // with an immediate; 16-bit immediates are to be avoided.
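// The reason (a rough justification, not a hard rule): a 16-bit compare with
// an immediate such as "cmpw $1234, %ax" combines an operand-size prefix with
// an imm16, i.e. a length-changing prefix, which decodes slowly on many Intel
// cores; "cmpl $1234, %eax" may encode a byte larger but avoids that stall.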
17318 if ((Op0.getValueType() == MVT::i16 &&
17319 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17320 !DAG.getMachineFunction().getFunction().optForMinSize() &&
17321 !Subtarget.isAtom()) {
17322 unsigned ExtendOp =
17323 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17324 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17325 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17327 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17328 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17329 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
17330 return SDValue(Sub.getNode(), 1);
17332 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17335 /// Convert a comparison if required by the subtarget.
17336 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17337 SelectionDAG &DAG) const {
17338 // If the subtarget does not support the FUCOMI instruction, floating-point
17339 // comparisons have to be converted.
17340 if (Subtarget.hasCMov() ||
17341 Cmp.getOpcode() != X86ISD::CMP ||
17342 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17343 !Cmp.getOperand(1).getValueType().isFloatingPoint())
17346 // The instruction selector will select an FUCOM instruction instead of
17347 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
17348 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
17349 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
17351 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
17352 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
17353 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
17354 DAG.getConstant(8, dl, MVT::i8));
17355 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
17357 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
17358 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
17359 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
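// The machine code for an x87 compare on such targets then looks roughly like
// (exact FUCOM variant depends on the selected pattern):
//   fucompp              ; compare ST(0) with ST(1); result lands in FPSW
//   fnstsw  %ax          ; copy the FPU status word into AX
//   sahf                 ; load AH (the C0/C2/C3 condition bits) into EFLAGS
// after which ordinary JA/JE/etc. can branch on the comparison.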
17362 /// Check if replacement of SQRT with RSQRT should be disabled.
17363 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
17364 EVT VT = Op.getValueType();
17366 // We never want to use both SQRT and RSQRT instructions for the same input.
17367 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
17371 return Subtarget.hasFastVectorFSQRT();
17372 return Subtarget.hasFastScalarFSQRT();
17375 /// The minimum architected relative accuracy is 2^-12. We need one
17376 /// Newton-Raphson step to have a good float result (24 bits of precision).
17377 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
17378 SelectionDAG &DAG, int Enabled,
17379 int &RefinementSteps,
17380 bool &UseOneConstNR,
17381 bool Reciprocal) const {
17382 EVT VT = Op.getValueType();
17384 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
17385 // TODO: Add support for AVX512 (v16f32).
17386 // It is likely not profitable to do this for f64 because a double-precision
17387 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
17388 // instructions: convert to single, rsqrtss, convert back to double, refine
17389 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
17390 // along with FMA, this could be a throughput win.
17391 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
17392 // after legalize types.
17393 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17394 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
17395 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
17396 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17397 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17398 RefinementSteps = 1;
17400 UseOneConstNR = false;
17401 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
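// One Newton-Raphson refinement of the rsqrt estimate e0 ~= 1/sqrt(a) is
//   e1 = e0 * (1.5 - 0.5 * a * e0 * e0)
// which roughly doubles the ~12 accurate bits of the hardware estimate to the
// ~24 a float carries. The generic estimate machinery emits that refinement
// around the X86ISD::FRSQRT node returned here.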
17406 /// The minimum architected relative accuracy is 2^-12. We need one
17407 /// Newton-Raphson step to have a good float result (24 bits of precision).
17408 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
17410 int &RefinementSteps) const {
17411 EVT VT = Op.getValueType();
17413 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
17414 // TODO: Add support for AVX512 (v16f32).
17415 // It is likely not profitable to do this for f64 because a double-precision
17416 // reciprocal estimate with refinement on x86 prior to FMA requires
17417 // 15 instructions: convert to single, rcpss, convert back to double, refine
17418 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
17419 // along with FMA, this could be a throughput win.
17421 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17422 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
17423 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17424 // Enable estimate codegen with 1 refinement step for vector division.
17425 // Scalar division estimates are disabled because they break too much
17426 // real-world code. These defaults are intended to match GCC behavior.
17427 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
17430 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17431 RefinementSteps = 1;
17433 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
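// Analogously, one Newton-Raphson step for the rcp estimate e0 ~= 1/a is
//   e1 = e0 * (2.0 - a * e0)
// again roughly doubling the ~12 accurate bits of the hardware estimate; the
// caller materializes this refinement around the X86ISD::FRCP node returned
// here.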
17438 /// If we have at least two divisions that use the same divisor, convert to
17439 /// multiplication by a reciprocal. This may need to be adjusted for a given
17440 /// CPU if a division's cost is not at least twice the cost of a multiplication.
17441 /// This is because we still need one division to calculate the reciprocal and
17442 /// then we need two multiplies by that reciprocal as replacements for the
17443 /// original divisions.
17444 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
17448 /// Helper for creating an X86ISD::SETCC node.
17449 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17450 SelectionDAG &DAG) {
17451 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17452 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
17455 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
17456 /// according to equal/not-equal condition code \p CC.
17457 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
17458 const SDLoc &dl, SelectionDAG &DAG) {
17459 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
17460 // instruction. Since the shift amount is in-range-or-undefined, we know
17461 // that doing a bittest on the i32 value is ok. We extend to i32 because
17462 // the encoding for the i16 version is larger than the i32 version.
17463 // Also promote i16 to i32 for performance / code size reasons.
17464 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
17465 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
17467 // See if we can use the 32-bit instruction instead of the 64-bit one for a
17468 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
17469 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
17470 // known to be zero.
17471 if (Src.getValueType() == MVT::i64 &&
17472 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
17473 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
17475 // If the operand types disagree, extend the shift amount to match. Since
17476 // BT ignores high bits (like shifts) we can use anyextend.
17477 if (Src.getValueType() != BitNo.getValueType())
17478 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
17480 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
17481 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
17482 return getSETCC(Cond, BT, dl , DAG);
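// E.g. testing bit BitNo of an i32 Src selects to (registers illustrative):
//   btl  %ecx, %eax       ; CF = bit (%ecx & 31) of %eax
//   setb %al              ; SETNE -> COND_B (bit set), SETEQ -> COND_AE (bit clear)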
17485 /// Result of 'and' is compared against zero. Change to a BT node if possible.
17486 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
17487 const SDLoc &dl, SelectionDAG &DAG) {
17488 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
17489 SDValue Op0 = And.getOperand(0);
17490 SDValue Op1 = And.getOperand(1);
17491 if (Op0.getOpcode() == ISD::TRUNCATE)
17492 Op0 = Op0.getOperand(0);
17493 if (Op1.getOpcode() == ISD::TRUNCATE)
17494 Op1 = Op1.getOperand(0);
17497 if (Op1.getOpcode() == ISD::SHL)
17498 std::swap(Op0, Op1);
17499 if (Op0.getOpcode() == ISD::SHL) {
17500 if (isOneConstant(Op0.getOperand(0))) {
17501 // If we looked past a truncate, check that it's only truncating away known zeros.
17503 unsigned BitWidth = Op0.getValueSizeInBits();
17504 unsigned AndBitWidth = And.getValueSizeInBits();
17505 if (BitWidth > AndBitWidth) {
17507 DAG.computeKnownBits(Op0, Known);
17508 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
17512 RHS = Op0.getOperand(1);
17514 } else if (Op1.getOpcode() == ISD::Constant) {
17515 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
17516 uint64_t AndRHSVal = AndRHS->getZExtValue();
17517 SDValue AndLHS = Op0;
17519 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
17520 LHS = AndLHS.getOperand(0);
17521 RHS = AndLHS.getOperand(1);
17524 // Use BT if the immediate can't be encoded in a TEST instruction.
17525 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
17527 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
17532 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
17537 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
17539 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
17544 // SSE Condition code mapping:
17553 switch (SetCCOpcode) {
17554 default: llvm_unreachable("Unexpected SETCC condition");
17556 case ISD::SETEQ: SSECC = 0; break;
17558 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
17560 case ISD::SETOLT: SSECC = 1; break;
17562 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17564 case ISD::SETOLE: SSECC = 2; break;
17565 case ISD::SETUO: SSECC = 3; break;
17567 case ISD::SETNE: SSECC = 4; break;
17568 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
17569 case ISD::SETUGE: SSECC = 5; break;
17570 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17571 case ISD::SETUGT: SSECC = 6; break;
17572 case ISD::SETO: SSECC = 7; break;
17573 case ISD::SETUEQ: SSECC = 8; break;
17574 case ISD::SETONE: SSECC = 12; break;
17577 std::swap(Op0, Op1);
17582 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
17583 /// concatenate the result back.
17584 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
17585 MVT VT = Op.getSimpleValueType();
17587 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17588 "Unsupported value type for operation");
17590 unsigned NumElems = VT.getVectorNumElements();
17592 SDValue CC = Op.getOperand(2);
17594 // Extract the LHS vectors
17595 SDValue LHS = Op.getOperand(0);
17596 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
17597 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
17599 // Extract the RHS vectors
17600 SDValue RHS = Op.getOperand(1);
17601 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
17602 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
17604 // Issue the operation on the smaller types and concatenate the result back
17605 MVT EltVT = VT.getVectorElementType();
17606 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17607 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
17608 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
17609 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
17612 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17613 SDValue Op0 = Op.getOperand(0);
17614 SDValue Op1 = Op.getOperand(1);
17615 SDValue CC = Op.getOperand(2);
17616 MVT VT = Op.getSimpleValueType();
17619 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17620 "Unexpected type for boolean compare operation");
17621 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17622 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
17623 DAG.getConstant(-1, dl, VT));
17624 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
17625 DAG.getConstant(-1, dl, VT));
17626 switch (SetCCOpcode) {
17627 default: llvm_unreachable("Unexpected SETCC condition");
17629 // (x == y) -> ~(x ^ y)
17630 return DAG.getNode(ISD::XOR, dl, VT,
17631 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
17632 DAG.getConstant(-1, dl, VT));
17634 // (x != y) -> (x ^ y)
17635 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
17638 // (x > y) -> (x & ~y)
17639 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
17642 // (x < y) -> (~x & y)
17643 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
17646 // (x <= y) -> (~x | y)
17647 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
17650 // (x >= y) -> (x | ~y)
17651 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
17655 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17657 SDValue Op0 = Op.getOperand(0);
17658 SDValue Op1 = Op.getOperand(1);
17659 SDValue CC = Op.getOperand(2);
17660 MVT VT = Op.getSimpleValueType();
17663 assert(VT.getVectorElementType() == MVT::i1 &&
17664 "Cannot set masked compare for this operation");
17666 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17668 bool Unsigned = false;
17671 switch (SetCCOpcode) {
17672 default: llvm_unreachable("Unexpected SETCC condition");
17673 case ISD::SETNE: SSECC = 4; break;
17674 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
17675 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
17676 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17677 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
17678 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17679 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17680 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
17681 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17682 case ISD::SETLE: SSECC = 2; break;
17686 std::swap(Op0, Op1);
17688 // If this is a CMP(EQ|NE, AND(A,B), ZERO) pattern, change it to TESTM|TESTNM.
17689 if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
17690 SDValue A = peekThroughBitcasts(Op0);
17691 if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
17692 ISD::isBuildVectorAllZeros(Op1.getNode())) {
17693 MVT VT0 = Op0.getSimpleValueType();
17694 SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
17695 SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
17696 return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
17702 return DAG.getNode(Opc, dl, VT, Op0, Op1);
17703 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
17704 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17705 DAG.getConstant(SSECC, dl, MVT::i8));
17708 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17709 /// operand \p Op1. If non-trivial (for example because it's not constant)
17710 /// return an empty value.
17711 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17712 SelectionDAG &DAG) {
17713 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17717 MVT VT = Op1.getSimpleValueType();
17718 MVT EVT = VT.getVectorElementType();
17719 unsigned n = VT.getVectorNumElements();
17720 SmallVector<SDValue, 8> ULTOp1;
17722 for (unsigned i = 0; i < n; ++i) {
17723 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17724 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17727 // Avoid underflow.
17728 APInt Val = Elt->getAPIntValue();
17732 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17735 return DAG.getBuildVector(VT, dl, ULTOp1);
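// For example, "x <u <7, 7, 7, 7>" becomes "x <=u <6, 6, 6, 6>" (possible
// because every lane constant is non-zero), which the PSUBUS path below can
// then implement without commuting the operands.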
17738 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17739 SelectionDAG &DAG) {
17740 SDValue Op0 = Op.getOperand(0);
17741 SDValue Op1 = Op.getOperand(1);
17742 SDValue CC = Op.getOperand(2);
17743 MVT VT = Op.getSimpleValueType();
17744 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
17745 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17750 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17751 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17755 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17756 assert(VT.getVectorNumElements() <= 16);
17757 Opc = X86ISD::CMPM;
17759 Opc = X86ISD::CMPP;
17760 // The SSE/AVX packed FP comparison nodes are defined with a
17761 // floating-point vector result that matches the operand type. This allows
17762 // them to work with an SSE1 target (integer vector types are not legal).
17763 VT = Op0.getSimpleValueType();
17766 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17767 // emit two comparisons and a logic op to tie them together.
17769 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
17770 if (SSECC >= 8 && !Subtarget.hasAVX()) {
17771 // LLVM predicate is SETUEQ or SETONE.
17773 unsigned CombineOpc;
17774 if (Cond == ISD::SETUEQ) {
17777 CombineOpc = X86ISD::FOR;
17779 assert(Cond == ISD::SETONE);
17782 CombineOpc = X86ISD::FAND;
17785 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17786 DAG.getConstant(CC0, dl, MVT::i8));
17787 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17788 DAG.getConstant(CC1, dl, MVT::i8));
17789 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17791 // Handle all other FP comparisons here.
17792 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17793 DAG.getConstant(SSECC, dl, MVT::i8));
17796 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17797 // result type of SETCC. The bitcast is expected to be optimized away
17798 // during combining/isel.
17799 if (Opc == X86ISD::CMPP)
17800 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17805 MVT VTOp0 = Op0.getSimpleValueType();
17806 assert(VTOp0 == Op1.getSimpleValueType() &&
17807 "Expected operands with same type!");
17808 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17809 "Invalid number of packed elements for source and destination!");
17811 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17812 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17813 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17814 // legalizer first checks whether the first input operand of the setcc has
17815 // a legal type. If so, it promotes the return type to that same type.
17816 // Otherwise, the return type is promoted to the 'next legal type' which,
17817 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17819 // We reach this code only if the following two conditions are met:
17820 // 1. Both return type and operand type have been promoted to wider types
17821 // by the type legalizer.
17822 // 2. The original operand type has been promoted to a 256-bit vector.
17824 // Note that condition 2. only applies for AVX targets.
17825 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
17826 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17829 // The non-AVX512 code below works under the assumption that source and
17830 // destination types are the same.
17831 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17832 "Value types for source and destination must be the same!");
17834 // Break 256-bit integer vector compare into smaller ones.
17835 if (VT.is256BitVector() && !Subtarget.hasInt256())
17836 return Lower256IntVSETCC(Op, DAG);
17838 // Operands are boolean (vectors of i1)
17839 MVT OpVT = Op1.getSimpleValueType();
17840 if (OpVT.getVectorElementType() == MVT::i1)
17841 return LowerBoolVSETCC_AVX512(Op, DAG);
17843 // The result is boolean, but operands are int/float
17844 if (VT.getVectorElementType() == MVT::i1) {
17845 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
17846 // but there is no compare instruction for i8 and i16 elements on KNL.
17847 // In that case, use an SSE compare.
17848 bool UseAVX512Inst =
17849 (OpVT.is512BitVector() ||
17850 OpVT.getScalarSizeInBits() >= 32 ||
17851 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17854 return LowerIntVSETCC_AVX512(Op, DAG);
17856 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17857 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17860 // Lower using XOP integer comparisons.
17861 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17862 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17863 // Translate compare code to XOP PCOM compare mode.
17864 unsigned CmpMode = 0;
17866 default: llvm_unreachable("Unexpected SETCC condition");
17868 case ISD::SETLT: CmpMode = 0x00; break;
17870 case ISD::SETLE: CmpMode = 0x01; break;
17872 case ISD::SETGT: CmpMode = 0x02; break;
17874 case ISD::SETGE: CmpMode = 0x03; break;
17875 case ISD::SETEQ: CmpMode = 0x04; break;
17876 case ISD::SETNE: CmpMode = 0x05; break;
17879 // Are we comparing unsigned or signed integers?
17881 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
17883 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17884 DAG.getConstant(CmpMode, dl, MVT::i8));
17887 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
17888 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
17889 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
17890 SDValue BC0 = peekThroughBitcasts(Op0);
17891 if (BC0.getOpcode() == ISD::AND) {
17893 SmallVector<APInt, 64> EltBits;
17894 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
17895 VT.getScalarSizeInBits(), UndefElts,
17896 EltBits, false, false)) {
17897 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
17899 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
17905 // We are handling one of the integer comparisons here. Since SSE only has
17906 // GT and EQ comparisons for integers, swapping operands and multiple
17907 // operations may be required for some comparisons.
17908 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
17910 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
17911 Cond == ISD::SETGE || Cond == ISD::SETUGE;
17912 bool Invert = Cond == ISD::SETNE ||
17913 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
17915 // If both operands are known non-negative, then an unsigned compare is the
17916 // same as a signed compare and there's no need to flip signbits.
17917 // TODO: We could check for more general simplifications here since we're
17918 // computing known bits.
17919 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
17920 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
17922 // Special case: Use min/max operations for SETULE/SETUGE
17923 MVT VET = VT.getVectorElementType();
17925 (Subtarget.hasAVX512() && VET == MVT::i64) ||
17926 (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
17927 (Subtarget.hasSSE2() && (VET == MVT::i8));
17928 bool MinMax = false;
17932 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17933 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17937 Swap = Invert = FlipSigns = false;
17940 bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17941 bool Subus = false;
17942 if (!MinMax && HasSubus) {
17943 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for Op0 u<= Op1:
17945 //   t = psubus Op0, Op1
17946 //   pcmpeq t, <0..0>
17949 case ISD::SETULT: {
17950 // If the comparison is against a constant we can turn this into a
17951 // setule. With psubus, setule does not require a swap. This is
17952 // beneficial because the constant in the register is no longer
17953 // clobbered as the destination, so it can be hoisted out of a loop.
17954 // Only do this pre-AVX since vpcmp* is no longer destructive.
17955 if (Subtarget.hasAVX())
17957 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17959 Subus = true; Invert = false; Swap = false;
17963 // Psubus is better than flip-sign because it requires no inversion.
17964 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17965 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17969 Opc = X86ISD::SUBUS;
17975 std::swap(Op0, Op1);
17977 // Check that the operation in question is available (most are plain SSE2,
17978 // but PCMPGTQ and PCMPEQQ have different requirements).
17979 if (VT == MVT::v2i64) {
17980 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17981 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17983 // First cast everything to the right type.
17984 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17985 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17987 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17988 // bits of the inputs before performing those operations. The lower
17989 // compare is always unsigned.
17992 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17994 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17995 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17996 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17998 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17999 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
18001 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
18002 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
18003 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
18005 // Create masks for only the low parts/high parts of the 64 bit integers.
18006 static const int MaskHi[] = { 1, 1, 3, 3 };
18007 static const int MaskLo[] = { 0, 0, 2, 2 };
18008 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
18009 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
18010 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
18012 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
18013 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
18016 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18018 return DAG.getBitcast(VT, Result);
18021 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
18022 // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
18023 // pcmpeqd + pshufd + pand.
18024 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
18026 // First cast everything to the right type.
18027 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18028 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18031 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
18033 // Make sure the lower and upper halves are both all-ones.
18034 static const int Mask[] = { 1, 0, 3, 2 };
18035 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
18036 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
18039 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18041 return DAG.getBitcast(VT, Result);
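// The synthesized v2i64 equality sequence therefore looks like (a sketch):
//   pcmpeqd %xmm1, %xmm0        ; compare the four 32-bit halves
//   pshufd  $0xB1, %xmm0, %xmm1 ; swap lo/hi dwords within each qword (1,0,3,2)
//   pand    %xmm1, %xmm0        ; a qword is all-ones only if both halves matched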
18045 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18046 // bits of the inputs before performing those operations.
18048 MVT EltVT = VT.getVectorElementType();
18049 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
18051 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
18052 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
18055 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18057 // If the logical-not of the result is required, perform that now.
18059 Result = DAG.getNOT(dl, Result, VT);
18062 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
18065 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
18066 getZeroVector(VT, Subtarget, DAG, dl));
18071 // Try to select this as a KTEST+SETCC if possible.
18072 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
18073 const SDLoc &dl, SelectionDAG &DAG,
18074 const X86Subtarget &Subtarget) {
18075 // Only support equality comparisons.
18076 if (CC != ISD::SETEQ && CC != ISD::SETNE)
18079 // Must be a bitcast from vXi1.
18080 if (Op0.getOpcode() != ISD::BITCAST)
18083 Op0 = Op0.getOperand(0);
18084 MVT VT = Op0.getSimpleValueType();
18085 if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) &&
18086 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
18089 X86::CondCode X86CC;
18090 if (isNullConstant(Op1)) {
18091 X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
18095 SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0);
18096 return getSETCC(X86CC, KTEST, dl, DAG);
18099 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
18101 MVT VT = Op.getSimpleValueType();
18103 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
18105 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
18106 SDValue Op0 = Op.getOperand(0);
18107 SDValue Op1 = Op.getOperand(1);
18109 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18111 // Optimize to BT if possible.
18112 // Lower (X & (1 << N)) == 0 to BT(X, N).
18113 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
18114 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
18115 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
18116 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18117 if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
18121 // Try to lower using KTEST.
18122 if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
18125 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of these.
18127 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
18128 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18130 // If the input is a setcc, then reuse the input setcc or use a new one with
18131 // the inverted condition.
18132 if (Op0.getOpcode() == X86ISD::SETCC) {
18133 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
18134 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
18138 CCode = X86::GetOppositeBranchCondition(CCode);
18139 return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
18143 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
18144 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
18145 if (X86CC == X86::COND_INVALID)
18148 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
18149 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
18150 return getSETCC(X86CC, EFLAGS, dl, DAG);
18153 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
18154 SDValue LHS = Op.getOperand(0);
18155 SDValue RHS = Op.getOperand(1);
18156 SDValue Carry = Op.getOperand(2);
18157 SDValue Cond = Op.getOperand(3);
18160 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
18161 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
18163 // Recreate the carry if needed.
18164 EVT CarryVT = Carry.getValueType();
18165 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
18166 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
18167 Carry, DAG.getConstant(NegOne, DL, CarryVT));
18169 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18170 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
18171 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
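// SETCCCARRY shows up when a compare wider than the native word is expanded.
// E.g. an i128 unsigned "a < b" on x86-64 ends up as (a sketch; registers
// illustrative):
//   cmpq %rdx, %rdi       ; low 64 bits: sets CF on borrow
//   sbbq %rcx, %rsi       ; high 64 bits minus that borrow (value discarded)
//   setb %al              ; COND_B: a < b
// The ADD against -1 above simply re-materializes CF from the incoming carry
// value before the SBB is emitted.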
18174 /// Return true if the opcode is an X86 logical comparison.
18175 static bool isX86LogicalCmp(SDValue Op) {
18176 unsigned Opc = Op.getOpcode();
18177 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
18178 Opc == X86ISD::SAHF)
18180 if (Op.getResNo() == 1 &&
18181 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
18182 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
18183 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
18184 Opc == X86ISD::XOR || Opc == X86ISD::AND))
18187 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
18193 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
18194 if (V.getOpcode() != ISD::TRUNCATE)
18197 SDValue VOp0 = V.getOperand(0);
18198 unsigned InBits = VOp0.getValueSizeInBits();
18199 unsigned Bits = V.getValueSizeInBits();
18200 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
18203 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
18204 bool AddTest = true;
18205 SDValue Cond = Op.getOperand(0);
18206 SDValue Op1 = Op.getOperand(1);
18207 SDValue Op2 = Op.getOperand(2);
18209 MVT VT = Op1.getSimpleValueType();
18212 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
18213 // are available or VBLENDV if AVX is available.
18214 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
18215 if (Cond.getOpcode() == ISD::SETCC &&
18216 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
18217 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
18218 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
18219 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
18220 unsigned SSECC = translateX86FSETCC(
18221 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
18223 if (Subtarget.hasAVX512()) {
18224 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
18225 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
18226 assert(!VT.isVector() && "Not a scalar type?");
18227 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18230 if (SSECC < 8 || Subtarget.hasAVX()) {
18231 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
18232 DAG.getConstant(SSECC, DL, MVT::i8));
18234 // If we have AVX, we can use a variable vector select (VBLENDV) instead
18235 // of 3 logic instructions for size savings and potentially speed.
18236 // Unfortunately, there is no scalar form of VBLENDV.
18238 // If either operand is a constant, don't try this. We can expect to
18239 // optimize away at least one of the logic instructions later in that
18240 // case, so that sequence would be faster than a variable blend.
18242 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
18243 // uses XMM0 as the selection register. That may need just as many
18244 // instructions as the AND/ANDN/OR sequence due to register moves, so don't bother.
18247 if (Subtarget.hasAVX() &&
18248 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
18250 // Convert to vectors, do a VSELECT, and convert back to scalar.
18251 // All of the conversions should be optimized away.
18253 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
18254 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
18255 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
18256 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
18258 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18259 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18261 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18263 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18264 VSel, DAG.getIntPtrConstant(0, DL));
18266 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
18267 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
18268 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
18272 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18273 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18274 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
18275 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18278 // For v64i1 without 64-bit support we need to split and rejoin.
18279 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
18280 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
18281 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
18282 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
18283 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
18284 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
18285 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
18286 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
18287 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
18290 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
18292 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18293 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18294 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
18295 Op1Scalar = Op1.getOperand(0);
18297 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18298 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18299 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18300 Op2Scalar = Op2.getOperand(0);
18301 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18302 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18303 Op1Scalar, Op2Scalar);
18304 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18305 return DAG.getBitcast(VT, newSelect);
18306 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18307 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
18308 DAG.getIntPtrConstant(0, DL));
18312 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18313 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18314 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18315 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18316 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18317 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18318 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
18319 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
18322 if (Cond.getOpcode() == ISD::SETCC) {
18323 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
18325 // If the condition was updated, it's possible that the operands of the
18326 // select were also updated (for example, EmitTest has a RAUW). Refresh
18327 // the local references to the select operands in case they got stale.
18328 Op1 = Op.getOperand(1);
18329 Op2 = Op.getOperand(2);
18333 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18334 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18335 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18336 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18337 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18338 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
18339 if (Cond.getOpcode() == X86ISD::SETCC &&
18340 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18341 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18342 SDValue Cmp = Cond.getOperand(1);
18343 unsigned CondCode =
18344 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18346 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18347 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18348 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18349 SDValue CmpOp0 = Cmp.getOperand(0);
18351 // Apply further optimizations for special cases
18352 // (select (x != 0), -1, 0) -> neg & sbb
18353 // (select (x == 0), 0, -1) -> neg & sbb
18354 if (isNullConstant(Y) &&
18355 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18356 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18357 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18358 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18359 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18360 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18361 SDValue(Neg.getNode(), 1));
18365 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18366 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18367 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18369 SDValue Res = // Res = 0 or -1.
18370 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18371 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18373 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18374 Res = DAG.getNOT(DL, Res, Res.getValueType());
18376 if (!isNullConstant(Op2))
18377 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
18379 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18380 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18381 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18382 SDValue CmpOp0 = Cmp.getOperand(0);
18383 SDValue Src1, Src2;
18384 // True if Op2 is an XOR or OR operator and one of its operands matches Op1,
18386 // i.e. the pair is ( a , a op b ) or ( b , a op b ).
18387 auto isOrXorPattern = [&]() {
18388 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
18389 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
18391 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
18398 if (isOrXorPattern()) {
18400 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
18401 // We need a mask of all zeros or all ones with the same size as the other operands.
18403 if (CmpSz > VT.getSizeInBits())
18404 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
18405 else if (CmpSz < VT.getSizeInBits())
18406 Neg = DAG.getNode(ISD::AND, DL, VT,
18407 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
18408 DAG.getConstant(1, DL, VT));
18411 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
18412 Neg); // -(and (x, 0x1))
18413 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
18414 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
18419 // Look past (and (setcc_carry (cmp ...)), 1).
18420 if (Cond.getOpcode() == ISD::AND &&
18421 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18422 isOneConstant(Cond.getOperand(1)))
18423 Cond = Cond.getOperand(0);
18425 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18426 // setting operand in place of the X86ISD::SETCC.
18427 unsigned CondOpcode = Cond.getOpcode();
18428 if (CondOpcode == X86ISD::SETCC ||
18429 CondOpcode == X86ISD::SETCC_CARRY) {
18430 CC = Cond.getOperand(0);
18432 SDValue Cmp = Cond.getOperand(1);
18433 unsigned Opc = Cmp.getOpcode();
18434 MVT VT = Op.getSimpleValueType();
18436 bool IllegalFPCMov = false;
18437 if (VT.isFloatingPoint() && !VT.isVector() &&
18438 !isScalarFPTypeInSSEReg(VT)) // FPStack?
18439 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
18441 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
18442 Opc == X86ISD::BT) { // FIXME
18446 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18447 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18448 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18449 Cond.getOperand(0).getValueType() != MVT::i8)) {
18450 SDValue LHS = Cond.getOperand(0);
18451 SDValue RHS = Cond.getOperand(1);
18452 unsigned X86Opcode;
18455 switch (CondOpcode) {
18456 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18457 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18458 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18459 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18460 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18461 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18462 default: llvm_unreachable("unexpected overflowing operator");
18464 if (CondOpcode == ISD::UMULO)
18465 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18468 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18470 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
18472 if (CondOpcode == ISD::UMULO)
18473 Cond = X86Op.getValue(2);
18475 Cond = X86Op.getValue(1);
18477 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
18482 // Look past the truncate if the high bits are known zero.
18483 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18484 Cond = Cond.getOperand(0);
18486 // We know the result of AND is compared against zero. Try to match it to BT.
18488 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
18489 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
18490 CC = NewSetCC.getOperand(0);
18491 Cond = NewSetCC.getOperand(1);
18498 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
18499 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
18502 // a < b ? -1 : 0 -> RES = ~setcc_carry
18503 // a < b ? 0 : -1 -> RES = setcc_carry
18504 // a >= b ? -1 : 0 -> RES = setcc_carry
18505 // a >= b ? 0 : -1 -> RES = ~setcc_carry
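// SETCC_CARRY materializes all-zeros or all-ones from the carry flag, e.g.
// (illustrative): "cmp a, b; sbb r, r" leaves r = -1 if the compare set CF
// and r = 0 otherwise, which is how the patterns above are realized.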
18506 if (Cond.getOpcode() == X86ISD::SUB) {
18507 Cond = ConvertCmpIfNecessary(Cond, DAG);
18508 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
18510 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
18511 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18512 (isNullConstant(Op1) || isNullConstant(Op2))) {
18513 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18514 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18516 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
18517 return DAG.getNOT(DL, Res, Res.getValueType());
18522 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
18523 // widen the cmov and push the truncate through. This avoids introducing a new
18524 // branch during isel and doesn't add any extensions.
18525 if (Op.getValueType() == MVT::i8 &&
18526 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
18527 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
18528 if (T1.getValueType() == T2.getValueType() &&
18529 // Blacklist CopyFromReg to avoid partial register stalls.
18530 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
18531 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
18533 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
18537 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
18538 // condition is true.
18539 SDValue Ops[] = { Op2, Op1, CC, Cond };
18540 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
18543 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
18544 const X86Subtarget &Subtarget,
18545 SelectionDAG &DAG) {
18546 MVT VT = Op->getSimpleValueType(0);
18547 SDValue In = Op->getOperand(0);
18548 MVT InVT = In.getSimpleValueType();
18549 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
18550 MVT VTElt = VT.getVectorElementType();
18553 unsigned NumElts = VT.getVectorNumElements();
18555 // Extend to i32 elements if the element type is i8/i16 and BWI is not supported.
18557 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
18558 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
18560 // Widen to 512-bits if VLX is not supported.
18561 MVT WideVT = ExtVT;
18562 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
18563 NumElts *= 512 / ExtVT.getSizeInBits();
18564 InVT = MVT::getVectorVT(MVT::i1, NumElts);
18565 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
18566 In, DAG.getIntPtrConstant(0, dl));
18567 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
18571 MVT WideEltVT = WideVT.getVectorElementType();
18572 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
18573 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
18574 V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
18576 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
18577 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
18578 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
18581 // Truncate if we had to extend i16/i8 above.
18583 WideVT = MVT::getVectorVT(VTElt, NumElts);
18584 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
18587 // Extract back to 128/256-bit if we widened.
18589 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
18590 DAG.getIntPtrConstant(0, dl));
18595 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18596 SelectionDAG &DAG) {
18597 SDValue In = Op->getOperand(0);
18598 MVT InVT = In.getSimpleValueType();
18600 if (InVT.getVectorElementType() == MVT::i1)
18601 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
18603 if (Subtarget.hasFp256())
18604 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
18610 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
18611 // For sign extend this needs to handle all vector sizes and SSE4.1 and
18612 // non-SSE4.1 targets. For zero extend this should only handle inputs of
18613 // MVT::v64i8 when BWI is not supported, but AVX512 is.
18614 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
18615 const X86Subtarget &Subtarget,
18616 SelectionDAG &DAG) {
18617 SDValue In = Op->getOperand(0);
18618 MVT VT = Op->getSimpleValueType(0);
18619 MVT InVT = In.getSimpleValueType();
18620 assert(VT.getSizeInBits() == InVT.getSizeInBits());
18622 MVT SVT = VT.getVectorElementType();
18623 MVT InSVT = InVT.getVectorElementType();
18624 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
18626 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
18628 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
18630 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
18631 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
18632 !(VT.is512BitVector() && Subtarget.hasAVX512()))
18637 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
18638 // For 512-bit vectors, we need 128-bits or 256-bits.
18639 if (VT.getSizeInBits() > 128) {
18640 // Input needs to be at least the same number of elements as output, and
18641 // at least 128-bits.
18642 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
18643 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
18646 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
18647 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
18649 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
18650 // which are therefore legal and shouldn't occur here. AVX2/AVX512 pmovsx*
18651 // instructions still need to be handled here for 256/512-bit results.
18652 if (Subtarget.hasInt256()) {
18653 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
18654 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
18655 X86ISD::VSEXT : X86ISD::VZEXT;
18656 return DAG.getNode(ExtOpc, dl, VT, In);
18659 // We should only get here for sign extend.
18660 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
18661 "Unexpected opcode!");
18663 // Pre-SSE41 targets unpack the lower lanes and then sign-extend using SRAI.
18667 // As SRAI is only available on i16/i32 types, we expand only up to i32
18668 // and handle i64 separately.
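// Rough shape of the expansion for a v16i8 source sign-extended to v8i16
// (illustrative; exact operand order omitted): unpack the low bytes so each
// byte lands in the high half of an i16 lane, then "psraw $8" shifts the
// sign bits back down. Wider element sizes repeat the same unpack+shift idea.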
18669 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
18670 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
18671 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
18672 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
18673 Curr = DAG.getBitcast(CurrVT, Curr);
18676 SDValue SignExt = Curr;
18677 if (CurrVT != InVT) {
18678 unsigned SignExtShift =
18679 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
18680 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18681 DAG.getConstant(SignExtShift, dl, MVT::i8));
18687 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
18688 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18689 DAG.getConstant(31, dl, MVT::i8));
18690 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
18691 return DAG.getBitcast(VT, Ext);
18697 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18698 SelectionDAG &DAG) {
18699 MVT VT = Op->getSimpleValueType(0);
18700 SDValue In = Op->getOperand(0);
18701 MVT InVT = In.getSimpleValueType();
18704 if (InVT.getVectorElementType() == MVT::i1)
18705 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
18707 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
18708 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
18709 (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
18710 (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
18711 (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
18712 (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
18713 (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
18714 (VT != MVT::v32i16 || InVT != MVT::v32i8))
18717 if (Subtarget.hasInt256())
18718 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18720 // Optimize vectors in AVX mode:
18721 // sign extend v8i16 to v8i32 and v4i32 to v4i64.
18724 // Divide the input vector into two parts;
18725 // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 },
18726 // use the vpmovsx instructions to extend v4i32 -> v2i64 and v8i16 -> v4i32,
18727 // then concat the vectors back to the original VT.
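// For example (illustrative), v8i16 -> v8i32 on AVX1: shuffle out the low and
// high v8i16 halves, use vpmovsxwd to widen each half to v4i32, then
// concatenate (vinsertf128) the two halves into the final v8i32.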
18729 unsigned NumElems = InVT.getVectorNumElements();
18730 SDValue Undef = DAG.getUNDEF(InVT);
18732 SmallVector<int,8> ShufMask1(NumElems, -1);
18733 for (unsigned i = 0; i != NumElems/2; ++i)
18736 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
18738 SmallVector<int,8> ShufMask2(NumElems, -1);
18739 for (unsigned i = 0; i != NumElems/2; ++i)
18740 ShufMask2[i] = i + NumElems/2;
18742 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
18744 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18745 VT.getVectorNumElements() / 2);
18747 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18748 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18750 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18753 // Lower a truncating store. We need special lowering for vXi1 vectors.
18754 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18755 SelectionDAG &DAG) {
18756 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18758 EVT MemVT = St->getMemoryVT();
18759 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
18760 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18761 "Expected truncstore of i1 vector");
18763 SDValue Op = St->getValue();
18764 MVT OpVT = Op.getValueType().getSimpleVT();
18765 unsigned NumElts = OpVT.getVectorNumElements();
18766 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18768 // Truncate and store - everything is legal
18769 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18770 if (MemVT.getSizeInBits() < 8)
18771 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18772 DAG.getUNDEF(MVT::v8i1), Op,
18773 DAG.getIntPtrConstant(0, dl));
18774 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18775 St->getMemOperand());
18778 // Only a subset of the AVX-512 features is available; assume we have only AVX-512F.
18779 if (NumElts <= 8) {
18781 // Extend to an 8-element vector.
18782 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18783 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18784 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18786 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18787 Op = DAG.getBitcast(MVT::i8, Op);
18788 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18789 St->getMemOperand());
18792 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18793 // Divide the vector into 2 parts and store each part separately
18794 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18795 DAG.getIntPtrConstant(0, dl));
18796 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18797 SDValue BasePtr = St->getBasePtr();
18798 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18799 St->getMemOperand());
18800 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18801 DAG.getIntPtrConstant(16, dl));
18802 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18804 SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);
18806 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18807 BasePtrHi, St->getPointerInfo().getWithOffset(2),
18808 MinAlign(St->getAlignment(), 2U),
18809 St->getMemOperand()->getFlags());
18810 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18813 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18814 const X86Subtarget &Subtarget,
18815 SelectionDAG &DAG) {
18817 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18819 EVT MemVT = Ld->getMemoryVT();
18820 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18821 "Expected i1 vector load");
18822 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18823 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18824 MVT VT = Op.getValueType().getSimpleVT();
18825 unsigned NumElts = VT.getVectorNumElements();
18827 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18828 (Subtarget.hasDQI() && NumElts < 16) ||
18830 // Load and extend - everything is legal
18832 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18834 Ld->getMemOperand());
18835 // Replace chain users with the new chain.
18836 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18837 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18838 if (Subtarget.hasVLX()) {
18839 // Extract to v4i1/v2i1.
18840 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Load,
18841 DAG.getIntPtrConstant(0, dl));
18842 // Finally, do a normal sign-extend to the desired register.
18843 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
18846 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18847 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18849 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18850 DAG.getIntPtrConstant(0, dl));
18852 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18854 Ld->getMemOperand());
18855 // Replace chain users with the new chain.
18856 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18857 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18859 // Finally, do a normal sign-extend to the desired register.
18860 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18863 if (NumElts <= 8) {
18864 // Only a subset of the AVX-512 features is available; assume we have only AVX-512F.
18865 SDValue Load = DAG.getLoad(MVT::i8, dl, Ld->getChain(),
18867 Ld->getMemOperand());
18868 // Replace chain users with the new chain.
18869 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18870 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18872 SDValue BitVec = DAG.getBitcast(MVT::v8i1, Load);
18875 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18877 if (Subtarget.hasVLX()) {
18878 // Extract to v4i1/v2i1.
18879 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, BitVec,
18880 DAG.getIntPtrConstant(0, dl));
18881 // Finally, do a normal sign-extend to the desired register.
18882 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
18885 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18886 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18887 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18888 DAG.getIntPtrConstant(0, dl));
18891 assert(VT == MVT::v32i8 && "Unexpected extload type");
18893 SDValue BasePtr = Ld->getBasePtr();
18894 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18896 Ld->getMemOperand());
18898 SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);
18900 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi,
18901 Ld->getPointerInfo().getWithOffset(2),
18902 MinAlign(Ld->getAlignment(), 2U),
18903 Ld->getMemOperand()->getFlags());
18905 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18906 LoadLo.getValue(1), LoadHi.getValue(1));
18907 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18909 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18910 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18911 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18914 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18915 // may emit an illegal shuffle but the expansion is still better than scalar
18916 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18917 // we'll emit a shuffle and an arithmetic shift.
18918 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18919 // TODO: It is possible to support ZExt by zeroing the undef values during
18920 // the shuffle phase or after the shuffle.
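// Illustrative example of the path below: a sextload of v4i8 to v4i32 does one
// 32-bit scalar load, places it in a vector with SCALAR_TO_VECTOR, bitcasts to
// v16i8, and then sign-extends the low four bytes in-register (pmovsxbd on
// SSE4.1, otherwise the unpack + arithmetic-shift expansion above).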
18921 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18922 SelectionDAG &DAG) {
18923 MVT RegVT = Op.getSimpleValueType();
18924 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18925 assert(RegVT.isInteger() &&
18926 "We only custom lower integer vector sext loads.");
18928 // Nothing useful we can do without SSE2 shuffles.
18929 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18931 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18933 EVT MemVT = Ld->getMemoryVT();
18934 if (MemVT.getScalarType() == MVT::i1)
18935 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18937 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18938 unsigned RegSz = RegVT.getSizeInBits();
18940 ISD::LoadExtType Ext = Ld->getExtensionType();
18942 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18943 && "Only anyext and sext are currently implemented.");
18944 assert(MemVT != RegVT && "Cannot extend to the same type");
18945 assert(MemVT.isVector() && "Must load a vector from memory");
18947 unsigned NumElems = RegVT.getVectorNumElements();
18948 unsigned MemSz = MemVT.getSizeInBits();
18949 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18951 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18952 // The only way in which we have a legal 256-bit vector result but not the
18953 // integer 256-bit operations needed to directly lower a sextload is if we
18954 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18955 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18956 // correctly legalized. We do this late to allow the canonical form of
18957 // sextload to persist throughout the rest of the DAG combiner -- it wants
18958 // to fold together any extensions it can, and so will fuse a sign_extend
18959 // of an sextload into a sextload targeting a wider value.
18961 if (MemSz == 128) {
18962 // Just switch this to a normal load.
18963 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18964 "it must be a legal 128-bit vector "
18966 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18967 Ld->getPointerInfo(), Ld->getAlignment(),
18968 Ld->getMemOperand()->getFlags());
18970 assert(MemSz < 128 &&
18971 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18972 // Do an sext load to a 128-bit vector type. We want to use the same
18973 // number of elements, but elements half as wide. This will end up being
18974 // recursively lowered by this routine, but will succeed as we definitely
18975 // have all the necessary features if we're using AVX1.
18977 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18978 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18980 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18981 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18982 Ld->getMemOperand()->getFlags());
18985 // Replace chain users with the new chain.
18986 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18987 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18989 // Finally, do a normal sign-extend to the desired register.
18990 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18993 // All sizes must be a power of two.
18994 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18995 "Non-power-of-two elements are not custom lowered!");
18997 // Attempt to load the original value using scalar loads.
18998 // Find the largest scalar type that divides the total loaded size.
18999 MVT SclrLoadTy = MVT::i8;
19000 for (MVT Tp : MVT::integer_valuetypes()) {
19001 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
19006 // On 32-bit systems we can't use 64-bit integer loads. Try bitcasting to f64.
19007 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
19009 SclrLoadTy = MVT::f64;
19011 // Calculate the number of scalar loads that we need to perform
19012 // in order to load our vector from memory.
19013 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
19015 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
19016 "Can only lower sext loads with a single scalar load!");
19018 unsigned loadRegZize = RegSz;
19019 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
19022 // If we don't have BWI we won't be able to create the shuffle needed for v8i64 extloads of v8i8.
19024 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19025 MemVT == MVT::v8i8)
19028 // Represent our vector as a sequence of elements of the
19029 // largest scalar type that we can load.
19030 EVT LoadUnitVecVT = EVT::getVectorVT(
19031 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
19033 // Represent the data using the same element type that is stored in
19034 // memory. In practice, we "widen" MemVT.
19036 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
19037 loadRegZize / MemVT.getScalarSizeInBits());
19039 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
19040 "Invalid vector type");
19042 // We can't shuffle using an illegal type.
19043 assert(TLI.isTypeLegal(WideVecVT) &&
19044 "We only lower types that form legal widened vector types");
19046 SmallVector<SDValue, 8> Chains;
19047 SDValue Ptr = Ld->getBasePtr();
19048 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
19049 TLI.getPointerTy(DAG.getDataLayout()));
19050 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
19052 for (unsigned i = 0; i < NumLoads; ++i) {
19053 // Perform a single load.
19054 SDValue ScalarLoad =
19055 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
19056 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
19057 Chains.push_back(ScalarLoad.getValue(1));
19058 // Create the first element using SCALAR_TO_VECTOR in order to avoid
19059 // another round of DAGCombining.
19061 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
19063 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
19064 ScalarLoad, DAG.getIntPtrConstant(i, dl));
19066 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
19069 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
19071 // Bitcast the loaded value to a vector of the original element type, in
19072 // the size of the target vector type.
19073 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
19074 unsigned SizeRatio = RegSz / MemSz;
19076 if (Ext == ISD::SEXTLOAD) {
19077 // If we have SSE4.1, we can directly emit a VSEXT node.
19078 if (Subtarget.hasSSE41()) {
19079 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
19080 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19084 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lanes.
19086 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
19087 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
19089 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
19090 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19094 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19095 MemVT == MVT::v8i8) {
19096 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
19097 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19101 // Redistribute the loaded elements into the different locations.
19102 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
19103 for (unsigned i = 0; i != NumElems; ++i)
19104 ShuffleVec[i * SizeRatio] = i;
19106 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
19107 DAG.getUNDEF(WideVecVT), ShuffleVec);
19109 // Bitcast to the requested type.
19110 Shuff = DAG.getBitcast(RegVT, Shuff);
19111 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19115 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
19116 /// each of which has no other use apart from the AND / OR.
19117 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
19118 Opc = Op.getOpcode();
19119 if (Opc != ISD::OR && Opc != ISD::AND)
19121 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19122 Op.getOperand(0).hasOneUse() &&
19123 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
19124 Op.getOperand(1).hasOneUse());
19127 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
19128 /// SETCC node has a single use.
19129 static bool isXor1OfSetCC(SDValue Op) {
19130 if (Op.getOpcode() != ISD::XOR)
19132 if (isOneConstant(Op.getOperand(1)))
19133 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19134 Op.getOperand(0).hasOneUse();
19138 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
19139 bool addTest = true;
19140 SDValue Chain = Op.getOperand(0);
19141 SDValue Cond = Op.getOperand(1);
19142 SDValue Dest = Op.getOperand(2);
19145 bool Inverted = false;
19147 if (Cond.getOpcode() == ISD::SETCC) {
19148 // Check for setcc([su]{add,sub,mul}o == 0).
19149 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
19150 isNullConstant(Cond.getOperand(1)) &&
19151 Cond.getOperand(0).getResNo() == 1 &&
19152 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
19153 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
19154 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
19155 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
19156 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
19157 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
19159 Cond = Cond.getOperand(0);
19161 if (SDValue NewCond = LowerSETCC(Cond, DAG))
19166 // FIXME: LowerXALUO doesn't handle these!!
19167 else if (Cond.getOpcode() == X86ISD::ADD ||
19168 Cond.getOpcode() == X86ISD::SUB ||
19169 Cond.getOpcode() == X86ISD::SMUL ||
19170 Cond.getOpcode() == X86ISD::UMUL)
19171 Cond = LowerXALUO(Cond, DAG);
19174 // Look past (and (setcc_carry (cmp ...)), 1).
19175 if (Cond.getOpcode() == ISD::AND &&
19176 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19177 isOneConstant(Cond.getOperand(1)))
19178 Cond = Cond.getOperand(0);
19180 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19181 // setting operand in place of the X86ISD::SETCC.
19182 unsigned CondOpcode = Cond.getOpcode();
19183 if (CondOpcode == X86ISD::SETCC ||
19184 CondOpcode == X86ISD::SETCC_CARRY) {
19185 CC = Cond.getOperand(0);
19187 SDValue Cmp = Cond.getOperand(1);
19188 unsigned Opc = Cmp.getOpcode();
19189 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
19190 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
19194 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
19198 // These can only come from an arithmetic instruction with overflow,
19199 // e.g. SADDO, UADDO.
19200 Cond = Cond.getOperand(1);
19206 CondOpcode = Cond.getOpcode();
19207 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19208 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19209 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19210 Cond.getOperand(0).getValueType() != MVT::i8)) {
19211 SDValue LHS = Cond.getOperand(0);
19212 SDValue RHS = Cond.getOperand(1);
19213 unsigned X86Opcode;
19216 // Keep this in sync with LowerXALUO, otherwise we might create redundant
19217 // instructions that can't be removed afterwards (e.g. X86ISD::ADD and X86ISD::INC).
19219 switch (CondOpcode) {
19220 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19222 if (isOneConstant(RHS)) {
19223 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
19226 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19227 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19229 if (isOneConstant(RHS)) {
19230 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
19233 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19234 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19235 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19236 default: llvm_unreachable("unexpected overflowing operator");
19239 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
19240 if (CondOpcode == ISD::UMULO)
19241 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19244 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19246 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
19248 if (CondOpcode == ISD::UMULO)
19249 Cond = X86Op.getValue(2);
19251 Cond = X86Op.getValue(1);
19253 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19257 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19258 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19259 if (CondOpc == ISD::OR) {
19260 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
19261 // two branches instead of an explicit OR instruction with a separate test.
19263 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19264 isX86LogicalCmp(Cmp)) {
19265 CC = Cond.getOperand(0).getOperand(0);
19266 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19267 Chain, Dest, CC, Cmp);
19268 CC = Cond.getOperand(1).getOperand(0);
19272 } else { // ISD::AND
19273 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19274 // two branches instead of an explicit AND instruction with a
19275 // separate test. However, we only do this if this block doesn't
19276 // have a fall-through edge, because this requires an explicit
19277 // jmp when the condition is false.
19278 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19279 isX86LogicalCmp(Cmp) &&
19280 Op.getNode()->hasOneUse()) {
19281 X86::CondCode CCode =
19282 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19283 CCode = X86::GetOppositeBranchCondition(CCode);
19284 CC = DAG.getConstant(CCode, dl, MVT::i8);
19285 SDNode *User = *Op.getNode()->use_begin();
19286 // Look for an unconditional branch following this conditional branch.
19287 // We need this because we need to reverse the successors in order
19288 // to implement FCMP_OEQ.
19289 if (User->getOpcode() == ISD::BR) {
19290 SDValue FalseBB = User->getOperand(1);
19292 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19293 assert(NewBR == User);
19297 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19298 Chain, Dest, CC, Cmp);
19299 X86::CondCode CCode =
19300 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19301 CCode = X86::GetOppositeBranchCondition(CCode);
19302 CC = DAG.getConstant(CCode, dl, MVT::i8);
19308 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19309 // Recognize 'xorb (setcc), 1' patterns. The xor inverts the condition.
19310 // It should be transformed by the DAG combiner except when the condition
19311 // is set by an arithmetic-with-overflow node.
19312 X86::CondCode CCode =
19313 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19314 CCode = X86::GetOppositeBranchCondition(CCode);
19315 CC = DAG.getConstant(CCode, dl, MVT::i8);
19316 Cond = Cond.getOperand(0).getOperand(1);
19318 } else if (Cond.getOpcode() == ISD::SETCC &&
19319 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19320 // For FCMP_OEQ, we can emit
19321 // two branches instead of an explicit AND instruction with a
19322 // separate test. However, we only do this if this block doesn't
19323 // have a fall-through edge, because this requires an explicit
19324 // jmp when the condition is false.
19325 if (Op.getNode()->hasOneUse()) {
19326 SDNode *User = *Op.getNode()->use_begin();
19327 // Look for an unconditional branch following this conditional branch.
19328 // We need this because we need to reverse the successors in order
19329 // to implement FCMP_OEQ.
19330 if (User->getOpcode() == ISD::BR) {
19331 SDValue FalseBB = User->getOperand(1);
19333 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19334 assert(NewBR == User);
19338 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19339 Cond.getOperand(0), Cond.getOperand(1));
19340 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19341 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19342 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19343 Chain, Dest, CC, Cmp);
19344 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
19349 } else if (Cond.getOpcode() == ISD::SETCC &&
19350 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19351 // For FCMP_UNE, we can emit
19352 // two branches instead of an explicit AND instruction with a
19353 // separate test. However, we only do this if this block doesn't
19354 // have a fall-through edge, because this requires an explicit
19355 // jmp when the condition is false.
19356 if (Op.getNode()->hasOneUse()) {
19357 SDNode *User = *Op.getNode()->use_begin();
19358 // Look for an unconditional branch following this conditional branch.
19359 // We need this because we need to reverse the successors in order
19360 // to implement FCMP_UNE.
19361 if (User->getOpcode() == ISD::BR) {
19362 SDValue FalseBB = User->getOperand(1);
19364 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19365 assert(NewBR == User);
19368 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19369 Cond.getOperand(0), Cond.getOperand(1));
19370 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19371 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19372 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19373 Chain, Dest, CC, Cmp);
19374 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
19384 // Look past the truncate if the high bits are known zero.
19385 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19386 Cond = Cond.getOperand(0);
19388 // We know the result of AND is compared against zero. Try to match it to BT.
19390 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19391 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19392 CC = NewSetCC.getOperand(0);
19393 Cond = NewSetCC.getOperand(1);
19400 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19401 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19402 Cond = EmitTest(Cond, X86Cond, dl, DAG);
19404 Cond = ConvertCmpIfNecessary(Cond, DAG);
19405 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19406 Chain, Dest, CC, Cond);
19409 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19410 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19411 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19412 // that the guard pages used by the OS virtual memory manager are allocated in
19413 // the correct sequence.
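// Conceptually (illustrative pseudocode only, not what is emitted verbatim),
// a probe loop touches one byte in every 4K page of the new allocation:
//   for (uint64_t Off = 0; Off < AllocSize; Off += 4096)
//     *(volatile char *)(StackPtr - Off) = 0;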
19415 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19416 SelectionDAG &DAG) const {
19417 MachineFunction &MF = DAG.getMachineFunction();
19418 bool SplitStack = MF.shouldSplitStack();
19419 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19420 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
19421 SplitStack || EmitStackProbe;
19425 SDNode *Node = Op.getNode();
19426 SDValue Chain = Op.getOperand(0);
19427 SDValue Size = Op.getOperand(1);
19428 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19429 EVT VT = Node->getValueType(0);
19431 // Chain the dynamic stack allocation so that it doesn't modify the stack
19432 // pointer when other instructions are using the stack.
19433 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19435 bool Is64Bit = Subtarget.is64Bit();
19436 MVT SPTy = getPointerTy(DAG.getDataLayout());
19440 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19441 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19442 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19443 " not tell us which reg is the stack pointer!");
19445 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19446 Chain = SP.getValue(1);
19447 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19448 unsigned StackAlign = TFI.getStackAlignment();
19449 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19450 if (Align > StackAlign)
19451 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19452 DAG.getConstant(-(uint64_t)Align, dl, VT));
19453 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19454 } else if (SplitStack) {
19455 MachineRegisterInfo &MRI = MF.getRegInfo();
19458 // The 64-bit implementation of segmented stacks needs to clobber both r10 and
19459 // r11, which makes it impossible to use along with nested parameters.
19460 const Function &F = MF.getFunction();
19461 for (const auto &A : F.args()) {
19462 if (A.hasNestAttr())
19463 report_fatal_error("Cannot use segmented stacks with functions that "
19464 "have nested arguments.");
19468 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19469 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19470 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19471 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19472 DAG.getRegister(Vreg, SPTy));
19474 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19475 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19476 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19478 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19479 unsigned SPReg = RegInfo->getStackRegister();
19480 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19481 Chain = SP.getValue(1);
19484 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19485 DAG.getConstant(-(uint64_t)Align, dl, VT));
19486 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
19492 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19493 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
19495 SDValue Ops[2] = {Result, Chain};
19496 return DAG.getMergeValues(Ops, dl);
19499 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
19500 MachineFunction &MF = DAG.getMachineFunction();
19501 auto PtrVT = getPointerTy(MF.getDataLayout());
19502 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19504 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19507 if (!Subtarget.is64Bit() ||
19508 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
19509 // vastart just stores the address of the VarArgsFrameIndex slot into the
19510 // memory location argument.
19511 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19512 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
19513 MachinePointerInfo(SV));
19517 // gp_offset         (0 - 6 * 8)
19518 // fp_offset         (48 - 48 + 8 * 16)
19519 // overflow_arg_area (points to parameters passed in memory)
// reg_save_area     (points to the register save area)
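// For reference, the SysV AMD64 __va_list_tag has this shape (C sketch); the
// stores below fill these fields in order:
//   struct __va_list_tag {
//     unsigned int gp_offset;         // offset into the GP register save area
//     unsigned int fp_offset;         // offset into the FP register save area
//     void        *overflow_arg_area; // next stack-passed argument
//     void        *reg_save_area;     // start of the register save area
//   };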
19521 SmallVector<SDValue, 8> MemOps;
19522 SDValue FIN = Op.getOperand(1);
19524 SDValue Store = DAG.getStore(
19525 Op.getOperand(0), DL,
19526 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
19527 MachinePointerInfo(SV));
19528 MemOps.push_back(Store);
19531 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
19532 Store = DAG.getStore(
19533 Op.getOperand(0), DL,
19534 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
19535 MachinePointerInfo(SV, 4));
19536 MemOps.push_back(Store);
19538 // Store ptr to overflow_arg_area
19539 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
19540 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19542 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
19543 MemOps.push_back(Store);
19545 // Store ptr to reg_save_area.
19546 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
19547 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
19548 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
19549 Store = DAG.getStore(
19550 Op.getOperand(0), DL, RSFIN, FIN,
19551 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
19552 MemOps.push_back(Store);
19553 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
19556 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
19557 assert(Subtarget.is64Bit() &&
19558 "LowerVAARG only handles 64-bit va_arg!");
19559 assert(Op.getNumOperands() == 4);
19561 MachineFunction &MF = DAG.getMachineFunction();
19562 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
19563 // The Win64 ABI uses char* instead of a structure.
19564 return DAG.expandVAArg(Op.getNode());
19566 SDValue Chain = Op.getOperand(0);
19567 SDValue SrcPtr = Op.getOperand(1);
19568 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19569 unsigned Align = Op.getConstantOperandVal(3);
19572 EVT ArgVT = Op.getNode()->getValueType(0);
19573 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19574 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
19577 // Decide which area this value should be read from.
19578 // TODO: Implement the AMD64 ABI in its entirety. This simple
19579 // selection mechanism works only for the basic types.
19580 if (ArgVT == MVT::f80) {
19581 llvm_unreachable("va_arg for f80 not yet implemented");
19582 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
19583 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
19584 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
19585 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
19587 llvm_unreachable("Unhandled argument type in LowerVAARG");
19590 if (ArgMode == 2) {
19591 // Sanity Check: Make sure using fp_offset makes sense.
19592 assert(!Subtarget.useSoftFloat() &&
19593 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
19594 Subtarget.hasSSE1());
19597 // Insert VAARG_64 node into the DAG
19598 // VAARG_64 returns two values: Variable Argument Address, Chain
19599 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
19600 DAG.getConstant(ArgMode, dl, MVT::i8),
19601 DAG.getConstant(Align, dl, MVT::i32)};
19602 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19603 SDValue VAARG = DAG.getMemIntrinsicNode(
19604 X86ISD::VAARG_64, dl,
19605 VTs, InstOps, MVT::i64,
19606 MachinePointerInfo(SV),
19608 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
19609 Chain = VAARG.getValue(1);
19611 // Load the next argument and return it
19612 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19615 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19616 SelectionDAG &DAG) {
19617 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19618 // where a va_list is still an i8*.
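// Note: the 24-byte memcpy below copies the whole __va_list_tag
// (4 + 4 + 8 + 8 bytes) from the source va_list to the destination.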
19619 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19620 if (Subtarget.isCallingConvWin64(
19621 DAG.getMachineFunction().getFunction().getCallingConv()))
19622 // Probably a Win64 va_copy.
19623 return DAG.expandVACopy(Op.getNode());
19625 SDValue Chain = Op.getOperand(0);
19626 SDValue DstPtr = Op.getOperand(1);
19627 SDValue SrcPtr = Op.getOperand(2);
19628 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19629 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19632 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19633 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19635 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19638 /// Handle vector element shifts where the shift amount is a constant.
19639 /// Takes immediate version of shift as input.
19640 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19641 SDValue SrcOp, uint64_t ShiftAmt,
19642 SelectionDAG &DAG) {
19643 MVT ElementType = VT.getVectorElementType();
19645 // Bitcast the source vector to the output type; this is mainly necessary for
19646 // vXi8/vXi64 shifts.
19647 if (VT != SrcOp.getSimpleValueType())
19648 SrcOp = DAG.getBitcast(VT, SrcOp);
19650 // Fold this packed shift into its first operand if ShiftAmt is 0.
19654 // Check for ShiftAmt >= element width
19655 if (ShiftAmt >= ElementType.getSizeInBits()) {
19656 if (Opc == X86ISD::VSRAI)
19657 ShiftAmt = ElementType.getSizeInBits() - 1;
19659 return DAG.getConstant(0, dl, VT);
19662 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19663 && "Unknown target vector shift-by-constant node");
19665 // Fold this packed vector shift into a build vector if SrcOp is a
19666 // vector of Constants or UNDEFs.
19667 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19668 SmallVector<SDValue, 8> Elts;
19669 unsigned NumElts = SrcOp->getNumOperands();
19670 ConstantSDNode *ND;
19673 default: llvm_unreachable("Unknown opcode!");
19674 case X86ISD::VSHLI:
19675 for (unsigned i=0; i!=NumElts; ++i) {
19676 SDValue CurrentOp = SrcOp->getOperand(i);
19677 if (CurrentOp->isUndef()) {
19678 Elts.push_back(CurrentOp);
19681 ND = cast<ConstantSDNode>(CurrentOp);
19682 const APInt &C = ND->getAPIntValue();
19683 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19686 case X86ISD::VSRLI:
19687 for (unsigned i=0; i!=NumElts; ++i) {
19688 SDValue CurrentOp = SrcOp->getOperand(i);
19689 if (CurrentOp->isUndef()) {
19690 Elts.push_back(CurrentOp);
19693 ND = cast<ConstantSDNode>(CurrentOp);
19694 const APInt &C = ND->getAPIntValue();
19695 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19698 case X86ISD::VSRAI:
19699 for (unsigned i=0; i!=NumElts; ++i) {
19700 SDValue CurrentOp = SrcOp->getOperand(i);
19701 if (CurrentOp->isUndef()) {
19702 Elts.push_back(CurrentOp);
19705 ND = cast<ConstantSDNode>(CurrentOp);
19706 const APInt &C = ND->getAPIntValue();
19707 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19712 return DAG.getBuildVector(VT, dl, Elts);
19715 return DAG.getNode(Opc, dl, VT, SrcOp,
19716 DAG.getConstant(ShiftAmt, dl, MVT::i8));
19719 /// Handle vector element shifts where the shift amount may or may not be a
19720 /// constant. Takes immediate version of shift as input.
19721 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19722 SDValue SrcOp, SDValue ShAmt,
19723 const X86Subtarget &Subtarget,
19724 SelectionDAG &DAG) {
19725 MVT SVT = ShAmt.getSimpleValueType();
19726 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19728 // Catch shift-by-constant.
19729 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19730 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19731 CShAmt->getZExtValue(), DAG);
19733 // Change opcode to non-immediate version
19735 default: llvm_unreachable("Unknown target vector shift node");
19736 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19737 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19738 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19741 // Need to build a vector containing the shift amount.
19742 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
19743 // +=================+============+=======================================+
19744 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
19745 // +=================+============+=======================================+
19746 // | i64 | Yes, No | Use ShAmt as lowest elt |
19747 // | i32 | Yes | zero-extend in-reg |
19748 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
19749 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
19750 // +=================+============+=======================================+
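// For example (illustrative): an i32 shift amount without SSE4.1 takes the
// last row above, i.e. ShAmt becomes build_vector(ShAmt, 0, undef, undef) as
// a v4i32, which is then bitcast to the 128-bit shift-count type below.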
19752 if (SVT == MVT::i64)
19753 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19754 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19755 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
19756 ShAmt = ShAmt.getOperand(0);
19757 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19758 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19759 } else if (Subtarget.hasSSE41() &&
19760 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19761 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19762 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19764 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
19765 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19766 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19769 // The return type has to be a 128-bit type with the same element
19770 // type as the input type.
19771 MVT EltVT = VT.getVectorElementType();
19772 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19774 ShAmt = DAG.getBitcast(ShVT, ShAmt);
19775 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19778 /// \brief Return Mask with the necessary casting or extending
19779 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19780 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19781 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19784 if (isAllOnesConstant(Mask))
19785 return DAG.getConstant(1, dl, MaskVT);
19786 if (X86::isZeroNode(Mask))
19787 return DAG.getConstant(0, dl, MaskVT);
19789 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19790 // Mask should be extended
19791 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19792 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19795 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19796 if (MaskVT == MVT::v64i1) {
19797 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19798 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
19800 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19801 DAG.getConstant(0, dl, MVT::i32));
19802 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19803 DAG.getConstant(1, dl, MVT::i32));
19805 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19806 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19808 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19810 // MaskVT requires fewer than 64 bits; truncate the mask (this should always succeed) and bitcast it to MaskVT.
19812 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19813 return DAG.getBitcast(MaskVT,
19814 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19818 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19819 Mask.getSimpleValueType().getSizeInBits());
19820 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
19821 // are extracted by the EXTRACT_SUBVECTOR below.
19822 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19823 DAG.getBitcast(BitcastVT, Mask),
19824 DAG.getIntPtrConstant(0, dl));
19828 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19829 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19830 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19831 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19832 SDValue PreservedSrc,
19833 const X86Subtarget &Subtarget,
19834 SelectionDAG &DAG) {
19835 MVT VT = Op.getSimpleValueType();
19836 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19837 unsigned OpcodeSelect = ISD::VSELECT;
19840 if (isAllOnesConstant(Mask))
19843 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19845 switch (Op.getOpcode()) {
19848 case X86ISD::CMPM_RND:
19849 case X86ISD::CMPMU:
19850 case X86ISD::VPSHUFBITQMB:
19851 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19852 case X86ISD::VFPCLASS:
19853 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19854 case X86ISD::VTRUNC:
19855 case X86ISD::VTRUNCS:
19856 case X86ISD::VTRUNCUS:
19857 case X86ISD::CVTPS2PH:
19858 // We can't use ISD::VSELECT here because it is not always "Legal"
19859 // for the destination type. For example, vpmovqb requires only AVX512,
19860 // while a vselect on byte elements requires BWI.
19861 OpcodeSelect = X86ISD::SELECT;
19864 if (PreservedSrc.isUndef())
19865 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19866 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19869 /// \brief Creates an SDNode for a predicated scalar operation.
19870 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19871 /// The mask comes in as MVT::i8 and should be transformed
19872 /// to MVT::v1i1 when lowering masking intrinsics.
19873 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19874 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19875 /// for a scalar instruction.
19876 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19877 SDValue PreservedSrc,
19878 const X86Subtarget &Subtarget,
19879 SelectionDAG &DAG) {
19881 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19882 if (MaskConst->getZExtValue() & 0x1)
19885 MVT VT = Op.getSimpleValueType();
19888 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19889 if (Op.getOpcode() == X86ISD::FSETCCM ||
19890 Op.getOpcode() == X86ISD::FSETCCM_RND)
19891 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19892 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19893 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19895 if (PreservedSrc.isUndef())
19896 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19897 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19900 static int getSEHRegistrationNodeSize(const Function *Fn) {
19901 if (!Fn->hasPersonalityFn())
19902 report_fatal_error(
19903 "querying registration node size for function without personality");
19904 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19905 // WinEHStatePass for the full struct definition.
19906 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19907 case EHPersonality::MSVC_X86SEH: return 24;
19908 case EHPersonality::MSVC_CXX: return 16;
19911 report_fatal_error(
19912 "can only recover FP for 32-bit MSVC EH personality functions");
19915 /// When the MSVC runtime transfers control to us, either to an outlined
19916 /// function or when returning to a parent frame after catching an exception, we
19917 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19918 /// Here's the math:
19919 /// RegNodeBase = EntryEBP - RegNodeSize
19920 /// ParentFP = RegNodeBase - ParentFrameOffset
19921 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19922 /// subtracting the offset (negative on x86) takes us back to the parent FP.
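/// Worked example (illustrative): for a 32-bit MSVC C++ EH personality,
/// getSEHRegistrationNodeSize returns 16, so
///   ParentFP = EntryEBP - 16 - ParentFrameOffset.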
19923 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19924 SDValue EntryEBP) {
19925 MachineFunction &MF = DAG.getMachineFunction();
19928 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19929 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19931 // It's possible that the parent function no longer has a personality function
19932 // if the exceptional code was optimized away, in which case we just return
19933 // the incoming EBP.
19934 if (!Fn->hasPersonalityFn())
19937 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19938 // registration, or the .set_setframe offset.
19939 MCSymbol *OffsetSym =
19940 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19941 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19942 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19943 SDValue ParentFrameOffset =
19944 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19946 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19947 // prologue to RBP in the parent function.
19948 const X86Subtarget &Subtarget =
19949 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19950 if (Subtarget.is64Bit())
19951 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19953 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19954 // RegNodeBase = EntryEBP - RegNodeSize
19955 // ParentFP = RegNodeBase - ParentFrameOffset
19956 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19957 DAG.getConstant(RegNodeSize, dl, PtrVT));
19958 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19961 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
19962 SelectionDAG &DAG) const {
19963 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19964 auto isRoundModeCurDirection = [](SDValue Rnd) {
19965 if (!isa<ConstantSDNode>(Rnd))
19968 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19969 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19973 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19974 MVT VT = Op.getSimpleValueType();
19975 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
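  // IntrData comes from the table in X86IntrinsicsInfo.h: Type selects one of
  // the lowering patterns below, Opc0 is the primary node opcode, and Opc1
  // (when nonzero) is typically a secondary opcode such as the variant that
  // takes an explicit rounding mode.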
19977 switch(IntrData->Type) {
19978 case INTR_TYPE_1OP:
19979 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19980 case INTR_TYPE_2OP:
19981 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19983 case INTR_TYPE_3OP:
19984 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19985 Op.getOperand(2), Op.getOperand(3));
19986 case INTR_TYPE_4OP:
19987 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19988 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19989 case INTR_TYPE_1OP_MASK_RM: {
19990 SDValue Src = Op.getOperand(1);
19991 SDValue PassThru = Op.getOperand(2);
19992 SDValue Mask = Op.getOperand(3);
19993 SDValue RoundingMode;
19994 // We always add rounding mode to the Node.
19995 // If the rounding mode is not specified, we add the
19996 // "current direction" mode.
19997 if (Op.getNumOperands() == 4)
19999 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20001 RoundingMode = Op.getOperand(4);
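    // This form is expected to fold the rounding operand into Opc0 itself,
    // which is why a separate rounding-mode opcode (Opc1) must not be set.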
20002 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
20003 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20005 Mask, PassThru, Subtarget, DAG);
20007 case INTR_TYPE_1OP_MASK: {
20008 SDValue Src = Op.getOperand(1);
20009 SDValue PassThru = Op.getOperand(2);
20010 SDValue Mask = Op.getOperand(3);
20011 // We add rounding mode to the Node when
20012 // - RM Opcode is specified and
20013 // - RM is not "current direction".
20014 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20015 if (IntrWithRoundingModeOpcode != 0) {
20016 SDValue Rnd = Op.getOperand(4);
20017 if (!isRoundModeCurDirection(Rnd)) {
20018 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20019 dl, Op.getValueType(),
20021 Mask, PassThru, Subtarget, DAG);
20024 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
20025 Mask, PassThru, Subtarget, DAG);
20027 case INTR_TYPE_SCALAR_MASK: {
20028 SDValue Src1 = Op.getOperand(1);
20029 SDValue Src2 = Op.getOperand(2);
20030 SDValue passThru = Op.getOperand(3);
20031 SDValue Mask = Op.getOperand(4);
20032 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20033 // There are 2 kinds of intrinsics in this group:
20034 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
20035 // (2) With rounding mode and sae - 7 operands.
20036 bool HasRounding = IntrWithRoundingModeOpcode != 0;
20037 if (Op.getNumOperands() == (5U + HasRounding)) {
20039 SDValue Rnd = Op.getOperand(5);
20040 if (!isRoundModeCurDirection(Rnd))
20041 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20042 dl, VT, Src1, Src2, Rnd),
20043 Mask, passThru, Subtarget, DAG);
20045 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20047 Mask, passThru, Subtarget, DAG);
20050 assert(Op.getNumOperands() == (6U + HasRounding) &&
20051 "Unexpected intrinsic form");
20052 SDValue RoundingMode = Op.getOperand(5);
20054 SDValue Sae = Op.getOperand(6);
20055 if (!isRoundModeCurDirection(Sae))
20056 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20057 dl, VT, Src1, Src2,
20058 RoundingMode, Sae),
20059 Mask, passThru, Subtarget, DAG);
20061 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20062 Src2, RoundingMode),
20063 Mask, passThru, Subtarget, DAG);
20065 case INTR_TYPE_SCALAR_MASK_RM: {
20066 SDValue Src1 = Op.getOperand(1);
20067 SDValue Src2 = Op.getOperand(2);
20068 SDValue Src0 = Op.getOperand(3);
20069 SDValue Mask = Op.getOperand(4);
20070 // There are 2 kinds of intrinsics in this group:
20071 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
20072 // (2) With rounding mode and sae - 7 operands.
20073 if (Op.getNumOperands() == 6) {
20074 SDValue Sae = Op.getOperand(5);
20075 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20077 Mask, Src0, Subtarget, DAG);
20079 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
20080 SDValue RoundingMode = Op.getOperand(5);
20081 SDValue Sae = Op.getOperand(6);
20082 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20083 RoundingMode, Sae),
20084 Mask, Src0, Subtarget, DAG);
20086 case INTR_TYPE_2OP_MASK:
20087 case INTR_TYPE_2OP_IMM8_MASK: {
20088 SDValue Src1 = Op.getOperand(1);
20089 SDValue Src2 = Op.getOperand(2);
20090 SDValue PassThru = Op.getOperand(3);
20091 SDValue Mask = Op.getOperand(4);
20093 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
20094 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
20096 // We specify 2 possible opcodes for intrinsics with rounding modes.
20097 // First, we check if the intrinsic may have non-default rounding mode,
20098 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20099 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20100 if (IntrWithRoundingModeOpcode != 0) {
20101 SDValue Rnd = Op.getOperand(5);
20102 if (!isRoundModeCurDirection(Rnd)) {
20103 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20104 dl, Op.getValueType(),
20106 Mask, PassThru, Subtarget, DAG);
20109 // TODO: Intrinsics should have fast-math-flags to propagate.
20110 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
20111 Mask, PassThru, Subtarget, DAG);
20113 case INTR_TYPE_2OP_MASK_RM: {
20114 SDValue Src1 = Op.getOperand(1);
20115 SDValue Src2 = Op.getOperand(2);
20116 SDValue PassThru = Op.getOperand(3);
20117 SDValue Mask = Op.getOperand(4);
20118 // We specify 2 possible modes for intrinsics, with/without rounding mode.
20120 // First, we check if the intrinsic has a rounding mode (6 operands);
20121 // if not, we set the rounding mode to "current".
20123 if (Op.getNumOperands() == 6)
20124 Rnd = Op.getOperand(5);
20126 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20127 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20129 Mask, PassThru, Subtarget, DAG);
20131 case INTR_TYPE_3OP_SCALAR_MASK: {
20132 SDValue Src1 = Op.getOperand(1);
20133 SDValue Src2 = Op.getOperand(2);
20134 SDValue Src3 = Op.getOperand(3);
20135 SDValue PassThru = Op.getOperand(4);
20136 SDValue Mask = Op.getOperand(5);
20138 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20139 if (IntrWithRoundingModeOpcode != 0) {
20140 SDValue Rnd = Op.getOperand(6);
20141 if (!isRoundModeCurDirection(Rnd))
20142 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20143 dl, VT, Src1, Src2, Src3, Rnd),
20144 Mask, PassThru, Subtarget, DAG);
20146 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20148 Mask, PassThru, Subtarget, DAG);
20150 case INTR_TYPE_3OP_MASK_RM: {
20151 SDValue Src1 = Op.getOperand(1);
20152 SDValue Src2 = Op.getOperand(2);
20153 SDValue Imm = Op.getOperand(3);
20154 SDValue PassThru = Op.getOperand(4);
20155 SDValue Mask = Op.getOperand(5);
20156 // We specify 2 possible modes for intrinsics, with/without rounding mode.
20158 // First, we check if the intrinsic has a rounding mode (7 operands);
20159 // if not, we set the rounding mode to "current".
20161 if (Op.getNumOperands() == 7)
20162 Rnd = Op.getOperand(6);
20164 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20165 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20166 Src1, Src2, Imm, Rnd),
20167 Mask, PassThru, Subtarget, DAG);
20169 case INTR_TYPE_3OP_IMM8_MASK:
20170 case INTR_TYPE_3OP_MASK: {
20171 SDValue Src1 = Op.getOperand(1);
20172 SDValue Src2 = Op.getOperand(2);
20173 SDValue Src3 = Op.getOperand(3);
20174 SDValue PassThru = Op.getOperand(4);
20175 SDValue Mask = Op.getOperand(5);
20177 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
20178 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
20180 // We specify 2 possible opcodes for intrinsics with rounding modes.
20181 // First, we check if the intrinsic may have non-default rounding mode,
20182 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20183 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20184 if (IntrWithRoundingModeOpcode != 0) {
20185 SDValue Rnd = Op.getOperand(6);
20186 if (!isRoundModeCurDirection(Rnd)) {
20187 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20188 dl, Op.getValueType(),
20189 Src1, Src2, Src3, Rnd),
20190 Mask, PassThru, Subtarget, DAG);
20193 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20195 Mask, PassThru, Subtarget, DAG);
20197 case VPERM_2OP_MASK : {
20198 SDValue Src1 = Op.getOperand(1);
20199 SDValue Src2 = Op.getOperand(2);
20200 SDValue PassThru = Op.getOperand(3);
20201 SDValue Mask = Op.getOperand(4);
20203 // Swap Src1 and Src2 in the node creation
20204 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
20205 Mask, PassThru, Subtarget, DAG);
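    // The swap mirrors the AVX2 permd/permps lowering further below: the index
    // vector is the last intrinsic operand but comes first on the permute node.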
20207 case VPERM_3OP_MASKZ:
20208 case VPERM_3OP_MASK:{
20209 MVT VT = Op.getSimpleValueType();
20210 // Src2 is the PassThru
20211 SDValue Src1 = Op.getOperand(1);
20212 // PassThru needs to be the same type as the destination in order
20213 // to pattern match correctly.
20214 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
20215 SDValue Src3 = Op.getOperand(3);
20216 SDValue Mask = Op.getOperand(4);
20217 SDValue PassThru = SDValue();
20219 // Set PassThru element.
20220 if (IntrData->Type == VPERM_3OP_MASKZ)
20221 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20225 // Swap Src1 and Src2 in the node creation
20226 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20227 dl, Op.getValueType(),
20229 Mask, PassThru, Subtarget, DAG);
20233 case FMA_OP_MASK: {
20234 SDValue Src1 = Op.getOperand(1);
20235 SDValue Src2 = Op.getOperand(2);
20236 SDValue Src3 = Op.getOperand(3);
20237 SDValue Mask = Op.getOperand(4);
20238 MVT VT = Op.getSimpleValueType();
20239 SDValue PassThru = SDValue();
20241 // Set PassThru element.
20242 if (IntrData->Type == FMA_OP_MASKZ)
20243 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20244 else if (IntrData->Type == FMA_OP_MASK3)
20249 // We specify 2 possible opcodes for intrinsics with rounding modes.
20250 // First, we check if the intrinsic may have non-default rounding mode,
20251 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20252 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20253 if (IntrWithRoundingModeOpcode != 0) {
20254 SDValue Rnd = Op.getOperand(5);
20255 if (!isRoundModeCurDirection(Rnd))
20256 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20257 dl, Op.getValueType(),
20258 Src1, Src2, Src3, Rnd),
20259 Mask, PassThru, Subtarget, DAG);
20261 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20262 dl, Op.getValueType(),
20264 Mask, PassThru, Subtarget, DAG);
20266 case FMA_OP_SCALAR_MASK:
20267 case FMA_OP_SCALAR_MASK3:
20268 case FMA_OP_SCALAR_MASKZ: {
20269 SDValue Src1 = Op.getOperand(1);
20270 SDValue Src2 = Op.getOperand(2);
20271 SDValue Src3 = Op.getOperand(3);
20272 SDValue Mask = Op.getOperand(4);
20273 MVT VT = Op.getSimpleValueType();
20274 SDValue PassThru = SDValue();
20276 // Set PassThru element.
20277 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
20278 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20279 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
20284 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20285 if (IntrWithRoundingModeOpcode != 0) {
20286 SDValue Rnd = Op.getOperand(5);
20287 if (!isRoundModeCurDirection(Rnd))
20288 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
20289 Op.getValueType(), Src1, Src2,
20291 Mask, PassThru, Subtarget, DAG);
20294 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
20295 Op.getValueType(), Src1, Src2,
20297 Mask, PassThru, Subtarget, DAG);
20299 case IFMA_OP_MASKZ:
20300 case IFMA_OP_MASK: {
20301 SDValue Src1 = Op.getOperand(1);
20302 SDValue Src2 = Op.getOperand(2);
20303 SDValue Src3 = Op.getOperand(3);
20304 SDValue Mask = Op.getOperand(4);
20305 MVT VT = Op.getSimpleValueType();
20306 SDValue PassThru = Src1;
20308 // Set PassThru element.
20309 if (IntrData->Type == IFMA_OP_MASKZ)
20310 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20312 // Note: we need to swizzle the operands to pass the multiply operands first.
20314 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20315 dl, Op.getValueType(),
20317 Mask, PassThru, Subtarget, DAG);
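    // For IFMA the accumulator Src1 doubles as the merge value, so PassThru
    // defaults to Src1 and only the MASKZ form substitutes a zero vector.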
20319 case TERLOG_OP_MASK:
20320 case TERLOG_OP_MASKZ: {
20321 SDValue Src1 = Op.getOperand(1);
20322 SDValue Src2 = Op.getOperand(2);
20323 SDValue Src3 = Op.getOperand(3);
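    // The truth-table immediate arrives as a wider integer and is truncated to
    // the i8 operand that the ternary-logic node expects.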
20324 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
20325 SDValue Mask = Op.getOperand(5);
20326 MVT VT = Op.getSimpleValueType();
20327 SDValue PassThru = Src1;
20328 // Set PassThru element.
20329 if (IntrData->Type == TERLOG_OP_MASKZ)
20330 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20332 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20333 Src1, Src2, Src3, Src4),
20334 Mask, PassThru, Subtarget, DAG);
20337 // ISD::FP_ROUND has a second argument that indicates if the truncation
20338 // does not change the value. Set it to 0 since it can change.
20339 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20340 DAG.getIntPtrConstant(0, dl));
20341 case CVTPD2PS_MASK: {
20342 SDValue Src = Op.getOperand(1);
20343 SDValue PassThru = Op.getOperand(2);
20344 SDValue Mask = Op.getOperand(3);
20345 // We add rounding mode to the Node when
20346 // - RM Opcode is specified and
20347 // - RM is not "current direction".
20348 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20349 if (IntrWithRoundingModeOpcode != 0) {
20350 SDValue Rnd = Op.getOperand(4);
20351 if (!isRoundModeCurDirection(Rnd)) {
20352 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20353 dl, Op.getValueType(),
20355 Mask, PassThru, Subtarget, DAG);
20358 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20359 // ISD::FP_ROUND has a second argument that indicates if the truncation
20360 // does not change the value. Set it to 0 since it can change.
20361 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20362 DAG.getIntPtrConstant(0, dl)),
20363 Mask, PassThru, Subtarget, DAG);
20366 // FPclass intrinsics with mask
20367 SDValue Src1 = Op.getOperand(1);
20368 MVT VT = Src1.getSimpleValueType();
20369 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20370 SDValue Imm = Op.getOperand(2);
20371 SDValue Mask = Op.getOperand(3);
20372 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20373 Mask.getSimpleValueType().getSizeInBits());
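    // The i1 compare result below is widened (insert_subvector into undef) to
    // the full bit width of the mask argument and then bitcast back to the
    // scalar integer type the intrinsic returns.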
20374 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
20375 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
20377 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20378 DAG.getUNDEF(BitcastVT), FPclassMask,
20379 DAG.getIntPtrConstant(0, dl));
20380 return DAG.getBitcast(Op.getValueType(), Res);
20383 SDValue Src1 = Op.getOperand(1);
20384 SDValue Imm = Op.getOperand(2);
20385 SDValue Mask = Op.getOperand(3);
20386 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
20387 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
20389 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
20390 DAG.getIntPtrConstant(0, dl));
20393 case CMP_MASK_CC: {
20394 // Comparison intrinsics with masks.
20395 // Example of transformation:
20396 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
20397 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
20398 // (i8 (bitcast
20399 //  (v8i1 (insert_subvector undef,
20400 // (v2i1 (and (PCMPEQM %a, %b),
20401 // (extract_subvector
20402 // (v8i1 (bitcast %mask)), 0))), 0))))
20403 MVT VT = Op.getOperand(1).getSimpleValueType();
20404 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20405 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20406 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20407 Mask.getSimpleValueType().getSizeInBits());
20409 if (IntrData->Type == CMP_MASK_CC) {
20410 SDValue CC = Op.getOperand(3);
20411 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
20412 // We specify 2 possible opcodes for intrinsics with rounding modes.
20413 // First, we check if the intrinsic may have non-default rounding mode,
20414 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20415 if (IntrData->Opc1 != 0) {
20416 SDValue Rnd = Op.getOperand(5);
20417 if (!isRoundModeCurDirection(Rnd))
20418 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
20419 Op.getOperand(2), CC, Rnd);
20421 // Default rounding mode.
20423 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20424 Op.getOperand(2), CC);
20427 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
20428 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20431 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
20433 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20434 DAG.getUNDEF(BitcastVT), CmpMask,
20435 DAG.getIntPtrConstant(0, dl));
20436 return DAG.getBitcast(Op.getValueType(), Res);
20438 case CMP_MASK_SCALAR_CC: {
20439 SDValue Src1 = Op.getOperand(1);
20440 SDValue Src2 = Op.getOperand(2);
20441 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
20442 SDValue Mask = Op.getOperand(4);
20445 if (IntrData->Opc1 != 0) {
20446 SDValue Rnd = Op.getOperand(5);
20447 if (!isRoundModeCurDirection(Rnd))
20448 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
20450 // Default rounding mode.
20452 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
20454 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
20456 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
20457 DAG.getIntPtrConstant(0, dl));
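    // The v1i1 compare result is extracted back out as an i8 to match the
    // scalar mask type these intrinsics return.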
20459 case COMI: { // Comparison intrinsics
20460 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20461 SDValue LHS = Op.getOperand(1);
20462 SDValue RHS = Op.getOperand(2);
20463 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
20464 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
20467 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
20468 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
20469 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
20470 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
20473 case ISD::SETNE: { // (ZF = 1 or PF = 1)
20474 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
20475 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
20476 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
20479 case ISD::SETGT: // (CF = 0 and ZF = 0)
20480 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
20482 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
20483 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
20486 case ISD::SETGE: // CF = 0
20487 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
20489 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
20490 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
20493 llvm_unreachable("Unexpected illegal condition!");
20495 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20497 case COMI_RM: { // Comparison intrinsics with Sae
20498 SDValue LHS = Op.getOperand(1);
20499 SDValue RHS = Op.getOperand(2);
20500 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
20501 SDValue Sae = Op.getOperand(4);
20504 if (isRoundModeCurDirection(Sae))
20505 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
20506 DAG.getConstant(CondVal, dl, MVT::i8));
20508 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20509 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20510 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
20511 DAG.getIntPtrConstant(0, dl));
20514 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
20515 Op.getOperand(1), Op.getOperand(2), Subtarget,
20517 case COMPRESS_EXPAND_IN_REG: {
20518 SDValue Mask = Op.getOperand(3);
20519 SDValue DataToCompress = Op.getOperand(1);
20520 SDValue PassThru = Op.getOperand(2);
20521 if (isAllOnesConstant(Mask)) // return data as is
20522 return Op.getOperand(1);
20524 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20526 Mask, PassThru, Subtarget, DAG);
20529 SDValue Mask = Op.getOperand(1);
20530 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20531 Mask.getSimpleValueType().getSizeInBits());
20532 Mask = DAG.getBitcast(MaskVT, Mask);
20533 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
20536 MVT VT = Op.getSimpleValueType();
20537 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
20539 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
20540 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
20541 // Arguments should be swapped.
20542 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
20543 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
20545 return DAG.getBitcast(VT, Res);
20548 MVT VT = Op.getSimpleValueType();
20549 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
20551 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
20552 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
20553 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
20554 return DAG.getBitcast(VT, Res);
20557 case FIXUPIMMS_MASKZ:
20559 case FIXUPIMM_MASKZ:{
20560 SDValue Src1 = Op.getOperand(1);
20561 SDValue Src2 = Op.getOperand(2);
20562 SDValue Src3 = Op.getOperand(3);
20563 SDValue Imm = Op.getOperand(4);
20564 SDValue Mask = Op.getOperand(5);
20565 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ?
20566 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
20567 // We specify 2 possible modes for intrinsics, with/without rounding mode.
20569 // First, we check if the intrinsic has a rounding mode (7 operands);
20570 // if not, we set the rounding mode to "current".
20572 if (Op.getNumOperands() == 7)
20573 Rnd = Op.getOperand(6);
20575 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20576 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20577 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20578 Src1, Src2, Src3, Imm, Rnd),
20579 Mask, Passthru, Subtarget, DAG);
20580 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20581 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20582 Src1, Src2, Src3, Imm, Rnd),
20583 Mask, Passthru, Subtarget, DAG);
20585 case CONVERT_TO_MASK: {
20586 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
20587 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
20588 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
20590 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
20592 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20593 DAG.getUNDEF(BitcastVT), CvtMask,
20594 DAG.getIntPtrConstant(0, dl));
20595 return DAG.getBitcast(Op.getValueType(), Res);
20598 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20599 // Clear the upper bits of the rounding immediate so that the legacy
20600 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20601 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20603 DAG.getConstant(0xf, dl, MVT::i32));
20604 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20605 Op.getOperand(1), RoundingMode);
20608 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
20609 // Clear the upper bits of the rounding immediate so that the legacy
20610 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20611 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20613 DAG.getConstant(0xf, dl, MVT::i32));
20614 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20615 Op.getOperand(1), Op.getOperand(2), RoundingMode);
20623 default: return SDValue(); // Don't custom lower most intrinsics.
20625 case Intrinsic::x86_avx2_permd:
20626 case Intrinsic::x86_avx2_permps:
20627 // Operands intentionally swapped. Mask is last operand to intrinsic,
20628 // but second operand for node/instruction.
20629 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
20630 Op.getOperand(2), Op.getOperand(1));
20632 // ptest and testp intrinsics. The intrinsics these come from are designed to
20633 // return an integer value, not just an instruction, so lower them to the ptest
20634 // or testp pattern and a setcc for the result.
20635 case Intrinsic::x86_sse41_ptestz:
20636 case Intrinsic::x86_sse41_ptestc:
20637 case Intrinsic::x86_sse41_ptestnzc:
20638 case Intrinsic::x86_avx_ptestz_256:
20639 case Intrinsic::x86_avx_ptestc_256:
20640 case Intrinsic::x86_avx_ptestnzc_256:
20641 case Intrinsic::x86_avx_vtestz_ps:
20642 case Intrinsic::x86_avx_vtestc_ps:
20643 case Intrinsic::x86_avx_vtestnzc_ps:
20644 case Intrinsic::x86_avx_vtestz_pd:
20645 case Intrinsic::x86_avx_vtestc_pd:
20646 case Intrinsic::x86_avx_vtestnzc_pd:
20647 case Intrinsic::x86_avx_vtestz_ps_256:
20648 case Intrinsic::x86_avx_vtestc_ps_256:
20649 case Intrinsic::x86_avx_vtestnzc_ps_256:
20650 case Intrinsic::x86_avx_vtestz_pd_256:
20651 case Intrinsic::x86_avx_vtestc_pd_256:
20652 case Intrinsic::x86_avx_vtestnzc_pd_256: {
20653 bool IsTestPacked = false;
20654 X86::CondCode X86CC;
20656 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
20657 case Intrinsic::x86_avx_vtestz_ps:
20658 case Intrinsic::x86_avx_vtestz_pd:
20659 case Intrinsic::x86_avx_vtestz_ps_256:
20660 case Intrinsic::x86_avx_vtestz_pd_256:
20661 IsTestPacked = true;
20663 case Intrinsic::x86_sse41_ptestz:
20664 case Intrinsic::x86_avx_ptestz_256:
20666 X86CC = X86::COND_E;
20668 case Intrinsic::x86_avx_vtestc_ps:
20669 case Intrinsic::x86_avx_vtestc_pd:
20670 case Intrinsic::x86_avx_vtestc_ps_256:
20671 case Intrinsic::x86_avx_vtestc_pd_256:
20672 IsTestPacked = true;
20674 case Intrinsic::x86_sse41_ptestc:
20675 case Intrinsic::x86_avx_ptestc_256:
20677 X86CC = X86::COND_B;
20679 case Intrinsic::x86_avx_vtestnzc_ps:
20680 case Intrinsic::x86_avx_vtestnzc_pd:
20681 case Intrinsic::x86_avx_vtestnzc_ps_256:
20682 case Intrinsic::x86_avx_vtestnzc_pd_256:
20683 IsTestPacked = true;
20685 case Intrinsic::x86_sse41_ptestnzc:
20686 case Intrinsic::x86_avx_ptestnzc_256:
20688 X86CC = X86::COND_A;
20692 SDValue LHS = Op.getOperand(1);
20693 SDValue RHS = Op.getOperand(2);
20694 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
20695 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
20696 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20697 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
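    // For example, int_x86_sse41_ptestz(a, b) is lowered roughly as
    //   (zext i32 (setcc COND_E, (X86ISD::PTEST a, b)))
    // i.e. a ptest whose ZF result is read back with a setcc.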
20699 case Intrinsic::x86_avx512_kortestz_w:
20700 case Intrinsic::x86_avx512_kortestc_w: {
20701 X86::CondCode X86CC =
20702 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
20703 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20704 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20705 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
20706 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20707 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20710 case Intrinsic::x86_avx512_knot_w: {
20711 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20712 SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
20713 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20714 return DAG.getBitcast(MVT::i16, Res);
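    // XOR with the all-ones v16i1 splat is how the mask-register NOT is
    // expressed here.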
20717 case Intrinsic::x86_avx512_kandn_w: {
20718 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20719 // Invert LHS for the not.
20720 LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
20721 DAG.getConstant(1, dl, MVT::v16i1));
20722 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20723 SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
20724 return DAG.getBitcast(MVT::i16, Res);
20727 case Intrinsic::x86_avx512_kxnor_w: {
20728 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20729 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20730 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20731 // Invert result for the not.
20732 Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
20733 DAG.getConstant(1, dl, MVT::v16i1));
20734 return DAG.getBitcast(MVT::i16, Res);
20737 case Intrinsic::x86_sse42_pcmpistria128:
20738 case Intrinsic::x86_sse42_pcmpestria128:
20739 case Intrinsic::x86_sse42_pcmpistric128:
20740 case Intrinsic::x86_sse42_pcmpestric128:
20741 case Intrinsic::x86_sse42_pcmpistrio128:
20742 case Intrinsic::x86_sse42_pcmpestrio128:
20743 case Intrinsic::x86_sse42_pcmpistris128:
20744 case Intrinsic::x86_sse42_pcmpestris128:
20745 case Intrinsic::x86_sse42_pcmpistriz128:
20746 case Intrinsic::x86_sse42_pcmpestriz128: {
20748 X86::CondCode X86CC;
20750 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
20751 case Intrinsic::x86_sse42_pcmpistria128:
20752 Opcode = X86ISD::PCMPISTRI;
20753 X86CC = X86::COND_A;
20755 case Intrinsic::x86_sse42_pcmpestria128:
20756 Opcode = X86ISD::PCMPESTRI;
20757 X86CC = X86::COND_A;
20759 case Intrinsic::x86_sse42_pcmpistric128:
20760 Opcode = X86ISD::PCMPISTRI;
20761 X86CC = X86::COND_B;
20763 case Intrinsic::x86_sse42_pcmpestric128:
20764 Opcode = X86ISD::PCMPESTRI;
20765 X86CC = X86::COND_B;
20767 case Intrinsic::x86_sse42_pcmpistrio128:
20768 Opcode = X86ISD::PCMPISTRI;
20769 X86CC = X86::COND_O;
20771 case Intrinsic::x86_sse42_pcmpestrio128:
20772 Opcode = X86ISD::PCMPESTRI;
20773 X86CC = X86::COND_O;
20775 case Intrinsic::x86_sse42_pcmpistris128:
20776 Opcode = X86ISD::PCMPISTRI;
20777 X86CC = X86::COND_S;
20779 case Intrinsic::x86_sse42_pcmpestris128:
20780 Opcode = X86ISD::PCMPESTRI;
20781 X86CC = X86::COND_S;
20783 case Intrinsic::x86_sse42_pcmpistriz128:
20784 Opcode = X86ISD::PCMPISTRI;
20785 X86CC = X86::COND_E;
20787 case Intrinsic::x86_sse42_pcmpestriz128:
20788 Opcode = X86ISD::PCMPESTRI;
20789 X86CC = X86::COND_E;
20792 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20793 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20794 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20795 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20796 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20799 case Intrinsic::x86_sse42_pcmpistri128:
20800 case Intrinsic::x86_sse42_pcmpestri128: {
20802 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
20803 Opcode = X86ISD::PCMPISTRI;
20805 Opcode = X86ISD::PCMPESTRI;
20807 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20808 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20809 return DAG.getNode(Opcode, dl, VTs, NewOps);
20812 case Intrinsic::eh_sjlj_lsda: {
20813 MachineFunction &MF = DAG.getMachineFunction();
20814 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20815 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20816 auto &Context = MF.getMMI().getContext();
20817 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20818 Twine(MF.getFunctionNumber()));
20819 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
20820 DAG.getMCSymbol(S, PtrVT));
20823 case Intrinsic::x86_seh_lsda: {
20824 // Compute the symbol for the LSDA. We know it'll get emitted later.
20825 MachineFunction &MF = DAG.getMachineFunction();
20826 SDValue Op1 = Op.getOperand(1);
20827 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20828 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20829 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20831 // Generate a simple absolute symbol reference. This intrinsic is only
20832 // supported on 32-bit Windows, which isn't PIC.
20833 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20834 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20837 case Intrinsic::x86_seh_recoverfp: {
20838 SDValue FnOp = Op.getOperand(1);
20839 SDValue IncomingFPOp = Op.getOperand(2);
20840 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20841 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20843 report_fatal_error(
20844 "llvm.x86.seh.recoverfp must take a function as the first argument");
20845 return recoverFramePointer(DAG, Fn, IncomingFPOp);
20848 case Intrinsic::localaddress: {
20849 // Returns one of the stack, base, or frame pointer registers, depending on
20850 // which is used to reference local variables.
20851 MachineFunction &MF = DAG.getMachineFunction();
20852 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20854 if (RegInfo->hasBasePointer(MF))
20855 Reg = RegInfo->getBaseRegister();
20856 else // This function handles the SP or FP case.
20857 Reg = RegInfo->getPtrSizedFrameRegister(MF);
20858 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
20863 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20864 SDValue Src, SDValue Mask, SDValue Base,
20865 SDValue Index, SDValue ScaleOp, SDValue Chain,
20866 const X86Subtarget &Subtarget) {
20868 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20869 // Scale must be constant.
20872 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20873 EVT MaskVT = Mask.getValueType();
20874 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20875 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20876 SDValue Segment = DAG.getRegister(0, MVT::i32);
20877 // If source is undef or we know it won't be used, use a zero vector
20878 // to break register dependency.
20879 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20880 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20881 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20882 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20883 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20884 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20885 return DAG.getMergeValues(RetOps, dl);
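  // Results of the machine node: value 0 is the gathered vector, value 1 the
  // mask result, and value 2 the chain; only the vector and the chain are
  // returned to the caller.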
20888 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20889 SDValue Src, SDValue Mask, SDValue Base,
20890 SDValue Index, SDValue ScaleOp, SDValue Chain,
20891 const X86Subtarget &Subtarget) {
20893 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20894 // Scale must be constant.
20897 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20898 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20899 Index.getSimpleValueType().getVectorNumElements());
20901 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20902 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20903 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20904 SDValue Segment = DAG.getRegister(0, MVT::i32);
20905 // If source is undef or we know it won't be used, use a zero vector
20906 // to break register dependency.
20907 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20908 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20909 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20910 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20911 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20912 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20913 return DAG.getMergeValues(RetOps, dl);
20916 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20917 SDValue Src, SDValue Mask, SDValue Base,
20918 SDValue Index, SDValue ScaleOp, SDValue Chain,
20919 const X86Subtarget &Subtarget) {
20921 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20922 // Scale must be constant.
20925 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20926 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20927 SDValue Segment = DAG.getRegister(0, MVT::i32);
20928 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20929 Index.getSimpleValueType().getVectorNumElements());
20931 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20932 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20933 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20934 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20935 return SDValue(Res, 1);
20938 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20939 SDValue Mask, SDValue Base, SDValue Index,
20940 SDValue ScaleOp, SDValue Chain,
20941 const X86Subtarget &Subtarget) {
20943 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20944 // Scale must be constant.
20947 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20948 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20949 SDValue Segment = DAG.getRegister(0, MVT::i32);
20951 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20952 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20953 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20954 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20955 return SDValue(Res, 0);
20958 /// Handles the lowering of builtin intrinsics that return the value
20959 /// of the extended control register.
20960 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20962 const X86Subtarget &Subtarget,
20963 SmallVectorImpl<SDValue> &Results) {
20964 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20965 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20968 // The ECX register is used to select the index of the XCR register to read.
20971 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20972 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20973 Chain = SDValue(N1, 0);
20975 // Reads the content of XCR and returns it in registers EDX:EAX.
20976 if (Subtarget.is64Bit()) {
20977 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20978 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20981 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20982 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20985 Chain = HI.getValue(1);
20987 if (Subtarget.is64Bit()) {
20988 // Merge the two 32-bit values into a 64-bit one.
20989 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20990 DAG.getConstant(32, DL, MVT::i8));
20991 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20992 Results.push_back(Chain);
20996 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20997 SDValue Ops[] = { LO, HI };
20998 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20999 Results.push_back(Pair);
21000 Results.push_back(Chain);
21003 /// Handles the lowering of builtin intrinsics that read performance monitor
21004 /// counters (x86_rdpmc).
21005 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
21007 const X86Subtarget &Subtarget,
21008 SmallVectorImpl<SDValue> &Results) {
21009 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21010 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21013 // The ECX register is used to select the index of the performance counter to read.
21015 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
21017 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
21019 // Reads the content of a 64-bit performance counter and returns it in the
21020 // registers EDX:EAX.
21021 if (Subtarget.is64Bit()) {
21022 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21023 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21026 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21027 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21030 Chain = HI.getValue(1);
21032 if (Subtarget.is64Bit()) {
21033 // The EAX register is loaded with the low-order 32 bits. The EDX register
21034 // is loaded with the supported high-order bits of the counter.
21035 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21036 DAG.getConstant(32, DL, MVT::i8));
21037 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21038 Results.push_back(Chain);
21042 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21043 SDValue Ops[] = { LO, HI };
21044 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21045 Results.push_back(Pair);
21046 Results.push_back(Chain);
21049 /// Handles the lowering of builtin intrinsics that read the time stamp counter
21050 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
21051 /// READCYCLECOUNTER nodes.
21052 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
21054 const X86Subtarget &Subtarget,
21055 SmallVectorImpl<SDValue> &Results) {
21056 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21057 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
21060 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
21061 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
21062 // and the EAX register is loaded with the low-order 32 bits.
21063 if (Subtarget.is64Bit()) {
21064 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21065 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21068 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21069 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21072 SDValue Chain = HI.getValue(1);
21074 if (Opcode == X86ISD::RDTSCP_DAG) {
21075 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21077 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
21078 // the ECX register. Add 'ecx' explicitly to the chain.
21079 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
21081 // Explicitly store the content of ECX at the location passed in input
21082 // to the 'rdtscp' intrinsic.
21083 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
21084 MachinePointerInfo());
21087 if (Subtarget.is64Bit()) {
21088 // The EDX register is loaded with the high-order 32 bits of the MSR, and
21089 // the EAX register is loaded with the low-order 32 bits.
21090 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21091 DAG.getConstant(32, DL, MVT::i8));
21092 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21093 Results.push_back(Chain);
21097 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21098 SDValue Ops[] = { LO, HI };
21099 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21100 Results.push_back(Pair);
21101 Results.push_back(Chain);
21104 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
21105 SelectionDAG &DAG) {
21106 SmallVector<SDValue, 2> Results;
21108 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
21110 return DAG.getMergeValues(Results, DL);
21113 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
21114 MachineFunction &MF = DAG.getMachineFunction();
21115 SDValue Chain = Op.getOperand(0);
21116 SDValue RegNode = Op.getOperand(2);
21117 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21119 report_fatal_error("EH registrations only live in functions using WinEH");
21121 // Cast the operand to an alloca, and remember the frame index.
21122 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
21124 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
21125 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
21127 // Return the chain operand without making any DAG nodes.
21131 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
21132 MachineFunction &MF = DAG.getMachineFunction();
21133 SDValue Chain = Op.getOperand(0);
21134 SDValue EHGuard = Op.getOperand(2);
21135 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21137 report_fatal_error("EHGuard only live in functions using WinEH");
21139 // Cast the operand to an alloca, and remember the frame index.
21140 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
21142 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
21143 EHInfo->EHGuardFrameIndex = FINode->getIndex();
21145 // Return the chain operand without making any DAG nodes.
21149 /// Emit Truncating Store with signed or unsigned saturation.
21151 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
21152 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
21153 SelectionDAG &DAG) {
21155 SDVTList VTs = DAG.getVTList(MVT::Other);
21156 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
21157 SDValue Ops[] = { Chain, Val, Ptr, Undef };
21159 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21160 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21163 /// Emit Masked Truncating Store with signed or unsigned saturation.
21165 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
21166 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
21167 MachineMemOperand *MMO, SelectionDAG &DAG) {
21169 SDVTList VTs = DAG.getVTList(MVT::Other);
21170 SDValue Ops[] = { Chain, Ptr, Mask, Val };
21172 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21173 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21176 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
21177 SelectionDAG &DAG) {
21178 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
21180 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
21183 case llvm::Intrinsic::x86_seh_ehregnode:
21184 return MarkEHRegistrationNode(Op, DAG);
21185 case llvm::Intrinsic::x86_seh_ehguard:
21186 return MarkEHGuard(Op, DAG);
21187 case llvm::Intrinsic::x86_flags_read_u32:
21188 case llvm::Intrinsic::x86_flags_read_u64:
21189 case llvm::Intrinsic::x86_flags_write_u32:
21190 case llvm::Intrinsic::x86_flags_write_u64: {
21191 // We need a frame pointer because this will get lowered to a PUSH/POP sequence.
21193 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21194 MFI.setHasCopyImplyingStackAdjustment(true);
21195 // Don't do anything here, we will expand these intrinsics out later
21196 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
21199 case Intrinsic::x86_lwpins32:
21200 case Intrinsic::x86_lwpins64: {
21202 SDValue Chain = Op->getOperand(0);
21203 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
21205 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
21206 Op->getOperand(3), Op->getOperand(4));
21207 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
21208 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
21209 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
21210 LwpIns.getValue(1));
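    // The carry flag produced by the LWPINS node becomes the intrinsic's boolean
    // result: it is read with a COND_B setcc, zero-extended to i8, and merged
    // with the output chain.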
21217 switch(IntrData->Type) {
21218 default: llvm_unreachable("Unknown Intrinsic Type");
21221 // Emit the node with the right value type.
21222 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
21223 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21225 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
21226 // Otherwise return the value from Rand, which is always 0, casted to i32.
21227 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
21228 DAG.getConstant(1, dl, Op->getValueType(1)),
21229 DAG.getConstant(X86::COND_B, dl, MVT::i8),
21230 SDValue(Result.getNode(), 1) };
21231 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
21233 // Return { result, isValid, chain }.
21234 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
21235 SDValue(Result.getNode(), 2));
21237 case GATHER_AVX2: {
21238 SDValue Chain = Op.getOperand(0);
21239 SDValue Src = Op.getOperand(2);
21240 SDValue Base = Op.getOperand(3);
21241 SDValue Index = Op.getOperand(4);
21242 SDValue Mask = Op.getOperand(5);
21243 SDValue Scale = Op.getOperand(6);
21244 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21245 Scale, Chain, Subtarget);
21248 //gather(v1, mask, index, base, scale);
21249 SDValue Chain = Op.getOperand(0);
21250 SDValue Src = Op.getOperand(2);
21251 SDValue Base = Op.getOperand(3);
21252 SDValue Index = Op.getOperand(4);
21253 SDValue Mask = Op.getOperand(5);
21254 SDValue Scale = Op.getOperand(6);
21255 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
21259 //scatter(base, mask, index, v1, scale);
21260 SDValue Chain = Op.getOperand(0);
21261 SDValue Base = Op.getOperand(2);
21262 SDValue Mask = Op.getOperand(3);
21263 SDValue Index = Op.getOperand(4);
21264 SDValue Src = Op.getOperand(5);
21265 SDValue Scale = Op.getOperand(6);
21266 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21267 Scale, Chain, Subtarget);
21270 SDValue Hint = Op.getOperand(6);
21271 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
21272 assert((HintVal == 2 || HintVal == 3) &&
21273 "Wrong prefetch hint in intrinsic: should be 2 or 3");
21274 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
21275 SDValue Chain = Op.getOperand(0);
21276 SDValue Mask = Op.getOperand(2);
21277 SDValue Index = Op.getOperand(3);
21278 SDValue Base = Op.getOperand(4);
21279 SDValue Scale = Op.getOperand(5);
21280 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
21283 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
21285 SmallVector<SDValue, 2> Results;
21286 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
21288 return DAG.getMergeValues(Results, dl);
21290 // Read Performance Monitoring Counters.
21292 SmallVector<SDValue, 2> Results;
21293 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
21294 return DAG.getMergeValues(Results, dl);
21296 // Get Extended Control Register.
21298 SmallVector<SDValue, 2> Results;
21299 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
21300 return DAG.getMergeValues(Results, dl);
21302 // XTEST intrinsics.
21304 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
21305 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21307 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
21308 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
21309 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
21310 Ret, SDValue(InTrans.getNode(), 1));
21314 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
21315 SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
21316 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
21317 DAG.getConstant(-1, dl, MVT::i8));
21318 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
21319 Op.getOperand(4), GenCF.getValue(1));
21320 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
21321 Op.getOperand(5), MachinePointerInfo());
21322 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
21323 SDValue Results[] = { SetCC, Store };
21324 return DAG.getMergeValues(Results, dl);
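    // Adding -1 to the carry-in operand regenerates CF; the ADC/SBB-style node
    // (Opc0) consumes that flag, its integer result is stored through the
    // pointer operand, and the carry-out is read back with a COND_B setcc.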
21326 case COMPRESS_TO_MEM: {
21327 SDValue Mask = Op.getOperand(4);
21328 SDValue DataToCompress = Op.getOperand(3);
21329 SDValue Addr = Op.getOperand(2);
21330 SDValue Chain = Op.getOperand(0);
21331 MVT VT = DataToCompress.getSimpleValueType();
21333 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21334 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21336 if (isAllOnesConstant(Mask)) // return just a store
21337 return DAG.getStore(Chain, dl, DataToCompress, Addr,
21338 MemIntr->getMemOperand());
21340 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21341 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21343 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
21344 MemIntr->getMemOperand(),
21345 false /* truncating */, true /* compressing */);
21347 case TRUNCATE_TO_MEM_VI8:
21348 case TRUNCATE_TO_MEM_VI16:
21349 case TRUNCATE_TO_MEM_VI32: {
21350 SDValue Mask = Op.getOperand(4);
21351 SDValue DataToTruncate = Op.getOperand(3);
21352 SDValue Addr = Op.getOperand(2);
21353 SDValue Chain = Op.getOperand(0);
21355 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21356 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21358 EVT MemVT = MemIntr->getMemoryVT();
21360 uint16_t TruncationOp = IntrData->Opc0;
21361 switch (TruncationOp) {
21362 case X86ISD::VTRUNC: {
21363 if (isAllOnesConstant(Mask)) // return just a truncate store
21364 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
21365 MemIntr->getMemOperand());
21367 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21368 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21370 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
21371 MemIntr->getMemOperand(), true /* truncating */);
21373 case X86ISD::VTRUNCUS:
21374 case X86ISD::VTRUNCS: {
21375 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
21376 if (isAllOnesConstant(Mask))
21377 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
21378 MemIntr->getMemOperand(), DAG);
21380 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21381 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21383 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
21384 VMask, MemVT, MemIntr->getMemOperand(), DAG);
21387 llvm_unreachable("Unsupported truncstore intrinsic");
21391 case EXPAND_FROM_MEM: {
21392 SDValue Mask = Op.getOperand(4);
21393 SDValue PassThru = Op.getOperand(3);
21394 SDValue Addr = Op.getOperand(2);
21395 SDValue Chain = Op.getOperand(0);
21396 MVT VT = Op.getSimpleValueType();
21398 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21399 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21401 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
21402 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
21403 if (X86::isZeroNode(Mask))
21404 return DAG.getUNDEF(VT);
21406 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21407 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21408 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
21409 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
21410 true /* expanding */);
21415 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
21416 SelectionDAG &DAG) const {
21417 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21418 MFI.setReturnAddressIsTaken(true);
21420 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
21423 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21425 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21428 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
21429 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21430 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
21431 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
21432 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
21433 MachinePointerInfo());
21436 // Just load the return address.
21437 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
21438 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
21439 MachinePointerInfo());
21442 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
21443 SelectionDAG &DAG) const {
21444 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
21445 return getReturnAddressFrameIndex(DAG);
21448 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
21449 MachineFunction &MF = DAG.getMachineFunction();
21450 MachineFrameInfo &MFI = MF.getFrameInfo();
21451 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
21452 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21453 EVT VT = Op.getValueType();
21455 MFI.setFrameAddressIsTaken(true);
21457 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
21458 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
21459 // is not possible to crawl up the stack without looking at the unwind codes
21461 int FrameAddrIndex = FuncInfo->getFAIndex();
21462 if (!FrameAddrIndex) {
21463 // Set up a frame object for the return address.
21464 unsigned SlotSize = RegInfo->getSlotSize();
21465 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
21466 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
21467 FuncInfo->setFAIndex(FrameAddrIndex);
21469 return DAG.getFrameIndex(FrameAddrIndex, VT);
21472 unsigned FrameReg =
21473 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21474 SDLoc dl(Op); // FIXME probably not meaningful
21475 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21476 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
21477 (FrameReg == X86::EBP && VT == MVT::i32)) &&
21478 "Invalid Frame Register!");
21479 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
21481 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
21482 MachinePointerInfo());
21486 // FIXME? Maybe this could be a TableGen attribute on some registers and
21487 // this table could be generated automatically from RegInfo.
21488 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
21489 SelectionDAG &DAG) const {
21490 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21491 const MachineFunction &MF = DAG.getMachineFunction();
21493 unsigned Reg = StringSwitch<unsigned>(RegName)
21494 .Case("esp", X86::ESP)
21495 .Case("rsp", X86::RSP)
21496 .Case("ebp", X86::EBP)
21497 .Case("rbp", X86::RBP)
21500 if (Reg == X86::EBP || Reg == X86::RBP) {
21501 if (!TFI.hasFP(MF))
21502 report_fatal_error("register " + StringRef(RegName) +
21503 " is allocatable: function has no frame pointer");
21506 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21507 unsigned FrameReg =
21508 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21509 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
21510 "Invalid Frame Register!");
21518 report_fatal_error("Invalid register name global variable");
21521 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
21522 SelectionDAG &DAG) const {
21523 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21524 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
21527 unsigned X86TargetLowering::getExceptionPointerRegister(
21528 const Constant *PersonalityFn) const {
21529 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
21530 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21532 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
21535 unsigned X86TargetLowering::getExceptionSelectorRegister(
21536 const Constant *PersonalityFn) const {
21537 // Funclet personalities don't use selectors (the runtime does the selection).
21538 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
21539 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21542 bool X86TargetLowering::needsFixedCatchObjects() const {
21543 return Subtarget.isTargetWin64();
21546 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
21547 SDValue Chain = Op.getOperand(0);
21548 SDValue Offset = Op.getOperand(1);
21549 SDValue Handler = Op.getOperand(2);
21552 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21553 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21554 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
21555 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
21556 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
21557 "Invalid Frame Register!");
21558 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
21559 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
21561 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
21562 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
21564 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
21565 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
21566 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
21568 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
21569 DAG.getRegister(StoreAddrReg, PtrVT));
21572 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
21573 SelectionDAG &DAG) const {
21575 // If the subtarget is not 64-bit, we may need the global base register
21576 // after isel expands the pseudo, i.e., after the CGBR pass has run.
21577 // Therefore, ask for the GlobalBaseReg now, so that the pass
21578 // inserts the code for us in case we need it.
21579 // Otherwise, we would end up referencing a virtual register
21580 // that is not defined!
21581 if (!Subtarget.is64Bit()) {
21582 const X86InstrInfo *TII = Subtarget.getInstrInfo();
21583 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
21585 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
21586 DAG.getVTList(MVT::i32, MVT::Other),
21587 Op.getOperand(0), Op.getOperand(1));
21590 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
21591 SelectionDAG &DAG) const {
21593 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
21594 Op.getOperand(0), Op.getOperand(1));
21597 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
21598 SelectionDAG &DAG) const {
21600 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
21604 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
21605 return Op.getOperand(0);
21608 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
21609 SelectionDAG &DAG) const {
21610 SDValue Root = Op.getOperand(0);
21611 SDValue Trmp = Op.getOperand(1); // trampoline
21612 SDValue FPtr = Op.getOperand(2); // nested function
21613 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
21616 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21617 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
21619 if (Subtarget.is64Bit()) {
21620 SDValue OutChains[6];
21622 // Large code-model.
21623 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
21624 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
21626 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
21627 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21629 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
21631 // Load the pointer to the nested function into R11.
21632 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
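// Because the store below is a little-endian i16, the REX.WB prefix (0x49)
// lands at trampoline offset 0 and the 0xB8+reg opcode byte at offset 1,
// i.e. the two-byte "movabsq ..., %r11" encoding; the 64-bit target (FPtr)
// is stored right after it at offset 2.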
21633 SDValue Addr = Trmp;
21634 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21635 Addr, MachinePointerInfo(TrmpAddr));
21637 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21638 DAG.getConstant(2, dl, MVT::i64));
21640 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21641 /* Alignment = */ 2);
21643 // Load the 'nest' parameter value into R10.
21644 // R10 is specified in X86CallingConv.td
21645 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21646 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21647 DAG.getConstant(10, dl, MVT::i64));
21648 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21649 Addr, MachinePointerInfo(TrmpAddr, 10));
21651 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21652 DAG.getConstant(12, dl, MVT::i64));
21654 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21655 /* Alignment = */ 2);
21657 // Jump to the nested function.
21658 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21659 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21660 DAG.getConstant(20, dl, MVT::i64));
21661 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21662 Addr, MachinePointerInfo(TrmpAddr, 20));
21664 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
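// The ModRM byte above encodes mod=0b11 (register-direct), reg=4 (the /4
// opcode extension that selects near JMP in the 0xFF group), and rm=r11's low
// three encoding bits, completing the "jmpq *%r11" whose REX prefix and 0xFF
// opcode were stored at offset 20.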
21665 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21666 DAG.getConstant(22, dl, MVT::i64));
21667 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21668 Addr, MachinePointerInfo(TrmpAddr, 22));
21670 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21672 const Function *Func =
21673 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21674 CallingConv::ID CC = Func->getCallingConv();
21679 llvm_unreachable("Unsupported calling convention");
21680 case CallingConv::C:
21681 case CallingConv::X86_StdCall: {
21682 // Pass 'nest' parameter in ECX.
21683 // Must be kept in sync with X86CallingConv.td
21684 NestReg = X86::ECX;
21686 // Check that ECX wasn't needed by an 'inreg' parameter.
21687 FunctionType *FTy = Func->getFunctionType();
21688 const AttributeList &Attrs = Func->getAttributes();
21690 if (!Attrs.isEmpty() && !Func->isVarArg()) {
21691 unsigned InRegCount = 0;
21694 for (FunctionType::param_iterator I = FTy->param_begin(),
21695 E = FTy->param_end(); I != E; ++I, ++Idx)
21696 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21697 auto &DL = DAG.getDataLayout();
21698 // FIXME: should only count parameters that are lowered to integers.
21699 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21702 if (InRegCount > 2) {
21703 report_fatal_error("Nest register in use - reduce number of inreg"
21709 case CallingConv::X86_FastCall:
21710 case CallingConv::X86_ThisCall:
21711 case CallingConv::Fast:
21712 // Pass 'nest' parameter in EAX.
21713 // Must be kept in sync with X86CallingConv.td
21714 NestReg = X86::EAX;
21718 SDValue OutChains[4];
21719 SDValue Addr, Disp;
21721 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21722 DAG.getConstant(10, dl, MVT::i32));
21723 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21725 // This is storing the opcode for MOV32ri.
21726 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21727 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21729 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21730 Trmp, MachinePointerInfo(TrmpAddr));
21732 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21733 DAG.getConstant(1, dl, MVT::i32));
21735 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21736 /* Alignment = */ 1);
21738 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21739 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21740 DAG.getConstant(5, dl, MVT::i32));
21741 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21742 Addr, MachinePointerInfo(TrmpAddr, 5),
21743 /* Alignment = */ 1);
21745 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21746 DAG.getConstant(6, dl, MVT::i32));
21748 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21749 /* Alignment = */ 1);
21751 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21755 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21756 SelectionDAG &DAG) const {
21758 The rounding mode is in bits 11:10 of FPSR, and has the following
21759 settings:
21760 00 Round to nearest
21761 01 Round to -inf
21762 10 Round to +inf
21763 11 Round to 0
21765 FLT_ROUNDS, on the other hand, expects the following:
21766 -1 Undefined
21767 0 Round to 0
21768 1 Round to nearest
21769 2 Round to +inf
21770 3 Round to -inf
21772 To perform the conversion, we do:
21773 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
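As a quick check of this formula: for FPSR bits 11:10 = 01 (round to -inf),
only the 0x400 bit is set, so the expression is ((0 | 2) + 1) & 3 = 3,
which is FLT_ROUNDS' encoding for round to -inf.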
21776 MachineFunction &MF = DAG.getMachineFunction();
21777 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21778 unsigned StackAlignment = TFI.getStackAlignment();
21779 MVT VT = Op.getSimpleValueType();
21782 // Save FP Control Word to stack slot
21783 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21784 SDValue StackSlot =
21785 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21787 MachineMemOperand *MMO =
21788 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21789 MachineMemOperand::MOStore, 2, 2);
21791 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21792 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21793 DAG.getVTList(MVT::Other),
21794 Ops, MVT::i16, MMO);
21796 // Load FP Control Word from stack slot
21798 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21800 // Transform as necessary
21802 DAG.getNode(ISD::SRL, DL, MVT::i16,
21803 DAG.getNode(ISD::AND, DL, MVT::i16,
21804 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21805 DAG.getConstant(11, DL, MVT::i8));
21807 DAG.getNode(ISD::SRL, DL, MVT::i16,
21808 DAG.getNode(ISD::AND, DL, MVT::i16,
21809 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21810 DAG.getConstant(9, DL, MVT::i8));
21813 DAG.getNode(ISD::AND, DL, MVT::i16,
21814 DAG.getNode(ISD::ADD, DL, MVT::i16,
21815 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
21816 DAG.getConstant(1, DL, MVT::i16)),
21817 DAG.getConstant(3, DL, MVT::i16));
21819 return DAG.getNode((VT.getSizeInBits() < 16 ?
21820 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
21823 // Split a unary integer op into 2 half-sized ops.
21824 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21825 MVT VT = Op.getSimpleValueType();
21826 unsigned NumElems = VT.getVectorNumElements();
21827 unsigned SizeInBits = VT.getSizeInBits();
21829 // Extract the Lo/Hi vectors
21831 SDValue Src = Op.getOperand(0);
21832 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21833 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21835 MVT EltVT = VT.getVectorElementType();
21836 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21837 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21838 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
21839 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
21842 // Decompose 256-bit ops into smaller 128-bit ops.
21843 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21844 assert(Op.getSimpleValueType().is256BitVector() &&
21845 Op.getSimpleValueType().isInteger() &&
21846 "Only handle AVX 256-bit vector integer operation");
21847 return LowerVectorIntUnary(Op, DAG);
21850 // Decompose 512-bit ops into smaller 256-bit ops.
21851 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21852 assert(Op.getSimpleValueType().is512BitVector() &&
21853 Op.getSimpleValueType().isInteger() &&
21854 "Only handle AVX 512-bit vector integer operation");
21855 return LowerVectorIntUnary(Op, DAG);
21858 /// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
21860 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
21861 // ( sub(trunc(lzcnt(zext32(x))), 32 - bitwidth(x)) ). In case zext32(x) is illegal,
21862 // split the vector, perform the operation on its Lo and Hi parts and
21863 // concatenate the results.
21864 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
21865 assert(Op.getOpcode() == ISD::CTLZ);
21867 MVT VT = Op.getSimpleValueType();
21868 MVT EltVT = VT.getVectorElementType();
21869 unsigned NumElems = VT.getVectorNumElements();
21871 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21872 "Unsupported element type");
21874 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
21876 return LowerVectorIntUnary(Op, DAG);
21878 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21879 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21880 "Unsupported value type for operation");
21882 // Use the natively supported vector instruction vplzcntd.
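// For example, for a v16i8 source each byte is zero-extended to an i32 lane;
// vplzcntd then counts leading zeros within 32 bits, so after truncating back
// we subtract Delta = 32 - 8 = 24 to recover the per-byte leading-zero count.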
21883 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21884 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21885 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21886 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21888 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21891 // Lower CTLZ using a PSHUFB lookup table implementation.
21892 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21893 const X86Subtarget &Subtarget,
21894 SelectionDAG &DAG) {
21895 MVT VT = Op.getSimpleValueType();
21896 int NumElts = VT.getVectorNumElements();
21897 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21898 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21900 // Per-nibble leading zero PSHUFB lookup table.
21901 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21902 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21903 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21904 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
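// Each LUT entry is the leading-zero count of its index viewed as a 4-bit
// nibble; e.g. nibble 0x2 (0b0010) has two leading zeros, so LUT[2] = 2,
// while any nibble with its top bit set (0x8..0xf) has none.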
21906 SmallVector<SDValue, 64> LUTVec;
21907 for (int i = 0; i < NumBytes; ++i)
21908 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21909 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21911 // Begin by bitcasting the input to a byte vector, then split those bytes
21912 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
21913 // If the hi input nibble is zero then we add both results together, otherwise
21914 // we just take the hi result (by masking the lo result to zero before the add).
21916 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21917 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21919 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21920 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21921 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21922 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21924 if (CurrVT.is512BitVector()) {
21925 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21926 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
21927 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
21929 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21932 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21933 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21934 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21935 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21937 // Merge the result from vXi8 back to VT, working on the lo/hi halves
21938 // of the current vector width in the same way we did for the nibbles.
21939 // If the upper half of the input element is zero then add the halves'
21940 // leading zero counts together, otherwise just use the upper half's.
21941 // Double the width of the result until we are at target width.
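// For example, merging two i8 counts into an i16 count: for an element like
// 0x00F0 the upper byte is zero, so the result is 8 (the upper byte's count)
// plus ctlz8(0xF0) = 0, i.e. 8, which matches ctlz16(0x00F0); for 0x10F0 the
// upper byte is non-zero and its count (3) is used directly.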
21942 while (CurrVT != VT) {
21943 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21944 int CurrNumElts = CurrVT.getVectorNumElements();
21945 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21946 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21947 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21949 // Check if the upper half of the input element is zero.
21950 if (CurrVT.is512BitVector()) {
21951 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21952 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
21953 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21954 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
21956 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21957 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21959 HiZ = DAG.getBitcast(NextVT, HiZ);
21961 // Move the upper/lower halves to the lower bits as we'll be extending to
21962 // NextVT. Mask the lower result to zero if HiZ is true and add the results
21964 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21965 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21966 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21967 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21968 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21975 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21976 const X86Subtarget &Subtarget,
21977 SelectionDAG &DAG) {
21978 MVT VT = Op.getSimpleValueType();
21980 if (Subtarget.hasCDI())
21981 return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21983 // Decompose 256-bit ops into smaller 128-bit ops.
21984 if (VT.is256BitVector() && !Subtarget.hasInt256())
21985 return Lower256IntUnary(Op, DAG);
21987 // Decompose 512-bit ops into smaller 256-bit ops.
21988 if (VT.is512BitVector() && !Subtarget.hasBWI())
21989 return Lower512IntUnary(Op, DAG);
21991 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21992 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21995 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21996 SelectionDAG &DAG) {
21997 MVT VT = Op.getSimpleValueType();
21999 unsigned NumBits = VT.getSizeInBits();
22001 unsigned Opc = Op.getOpcode();
22004 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
22006 Op = Op.getOperand(0);
22007 if (VT == MVT::i8) {
22008 // Zero extend to i32 since there is no i8 bsr.
22010 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
22013 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
22014 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
22015 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
22017 if (Opc == ISD::CTLZ) {
22018 // If src is zero (i.e. bsr sets ZF), returns NumBits.
22021 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
22022 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22025 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
22028 // Finally xor with NumBits-1.
22029 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
22030 DAG.getConstant(NumBits - 1, dl, OpVT));
22033 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
22037 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
22038 MVT VT = Op.getSimpleValueType();
22039 unsigned NumBits = VT.getScalarSizeInBits();
22042 if (VT.isVector()) {
22043 SDValue N0 = Op.getOperand(0);
22044 SDValue Zero = DAG.getConstant(0, dl, VT);
22046 // lsb(x) = (x & -x)
22047 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
22048 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
22050 // cttz_undef(x) = (width - 1) - ctlz(lsb)
22051 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
22052 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
22053 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
22054 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
22057 // cttz(x) = ctpop(lsb - 1)
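// For example, x = 0b01100: x & -x isolates the lowest set bit, giving
// lsb = 0b00100; then lsb - 1 = 0b00011 has ctpop = 2, which is indeed
// cttz(0b01100).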
22058 SDValue One = DAG.getConstant(1, dl, VT);
22059 return DAG.getNode(ISD::CTPOP, dl, VT,
22060 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
22063 assert(Op.getOpcode() == ISD::CTTZ &&
22064 "Only scalar CTTZ requires custom lowering");
22066 // Issue a bsf (scan bits forward) which also sets EFLAGS.
22067 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22068 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
22070 // If src is zero (i.e. bsf sets ZF), returns NumBits.
22073 DAG.getConstant(NumBits, dl, VT),
22074 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22077 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
22080 /// Break a 256-bit integer operation into two new 128-bit ones and then
22081 /// concatenate the result back.
22082 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
22083 MVT VT = Op.getSimpleValueType();
22085 assert(VT.is256BitVector() && VT.isInteger() &&
22086 "Unsupported value type for operation");
22088 unsigned NumElems = VT.getVectorNumElements();
22091 // Extract the LHS vectors
22092 SDValue LHS = Op.getOperand(0);
22093 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
22094 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
22096 // Extract the RHS vectors
22097 SDValue RHS = Op.getOperand(1);
22098 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
22099 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
22101 MVT EltVT = VT.getVectorElementType();
22102 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22104 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22105 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22106 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22109 /// Break a 512-bit integer operation into two new 256-bit ones and then
22110 /// concatenate the result back.
22111 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
22112 MVT VT = Op.getSimpleValueType();
22114 assert(VT.is512BitVector() && VT.isInteger() &&
22115 "Unsupported value type for operation");
22117 unsigned NumElems = VT.getVectorNumElements();
22120 // Extract the LHS vectors
22121 SDValue LHS = Op.getOperand(0);
22122 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
22123 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
22125 // Extract the RHS vectors
22126 SDValue RHS = Op.getOperand(1);
22127 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
22128 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
22130 MVT EltVT = VT.getVectorElementType();
22131 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22133 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22134 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22135 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22138 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
22139 MVT VT = Op.getSimpleValueType();
22140 if (VT.getScalarType() == MVT::i1)
22141 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
22142 Op.getOperand(0), Op.getOperand(1));
22143 assert(Op.getSimpleValueType().is256BitVector() &&
22144 Op.getSimpleValueType().isInteger() &&
22145 "Only handle AVX 256-bit vector integer operation");
22146 return Lower256IntArith(Op, DAG);
22149 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
22150 MVT VT = Op.getSimpleValueType();
22151 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
22152 // Since X86 does not have CMOV for 8-bit integer, we don't convert
22153 // 8-bit integer abs to NEG and CMOV.
22155 SDValue N0 = Op.getOperand(0);
22156 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
22157 DAG.getConstant(0, DL, VT), N0);
22158 SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
22159 SDValue(Neg.getNode(), 1)};
22160 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
22163 assert(Op.getSimpleValueType().is256BitVector() &&
22164 Op.getSimpleValueType().isInteger() &&
22165 "Only handle AVX 256-bit vector integer operation");
22166 return Lower256IntUnary(Op, DAG);
22169 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
22170 assert(Op.getSimpleValueType().is256BitVector() &&
22171 Op.getSimpleValueType().isInteger() &&
22172 "Only handle AVX 256-bit vector integer operation");
22173 return Lower256IntArith(Op, DAG);
22176 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
22177 SelectionDAG &DAG) {
22179 MVT VT = Op.getSimpleValueType();
22181 if (VT.getScalarType() == MVT::i1)
22182 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
22184 // Decompose 256-bit ops into smaller 128-bit ops.
22185 if (VT.is256BitVector() && !Subtarget.hasInt256())
22186 return Lower256IntArith(Op, DAG);
22188 SDValue A = Op.getOperand(0);
22189 SDValue B = Op.getOperand(1);
22191 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
22192 // vector pairs, multiply and truncate.
22193 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
22194 if (Subtarget.hasInt256()) {
22195 // For 512-bit vectors, split into 256-bit vectors to allow the
22196 // sign-extension to occur.
22197 if (VT == MVT::v64i8)
22198 return Lower512IntArith(Op, DAG);
22200 // For 256-bit vectors, split into 128-bit vectors to allow the
22201 // sign-extension to occur. We don't need this on AVX512BW as we can
22202 // safely sign-extend to v32i16.
22203 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
22204 return Lower256IntArith(Op, DAG);
22206 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
22207 return DAG.getNode(
22208 ISD::TRUNCATE, dl, VT,
22209 DAG.getNode(ISD::MUL, dl, ExVT,
22210 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
22211 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
22214 assert(VT == MVT::v16i8 &&
22215 "Pre-AVX2 support only supports v16i8 multiplication");
22216 MVT ExVT = MVT::v8i16;
22218 // Extract the lo parts and sign extend to i16
22220 if (Subtarget.hasSSE41()) {
22221 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
22222 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
22224 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22225 -1, 4, -1, 5, -1, 6, -1, 7};
22226 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22227 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22228 ALo = DAG.getBitcast(ExVT, ALo);
22229 BLo = DAG.getBitcast(ExVT, BLo);
22230 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22231 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
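// The -1 entries above leave the low byte of each i16 lane undefined while
// the shuffle places source byte i in the high byte; the arithmetic shift
// right by 8 then fills each lane with that byte's sign, i.e. a sign
// extension done in place.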
22234 // Extract the hi parts and sign extend to i16
22236 if (Subtarget.hasSSE41()) {
22237 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22238 -1, -1, -1, -1, -1, -1, -1, -1};
22239 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22240 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22241 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
22242 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
22244 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22245 -1, 12, -1, 13, -1, 14, -1, 15};
22246 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22247 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22248 AHi = DAG.getBitcast(ExVT, AHi);
22249 BHi = DAG.getBitcast(ExVT, BHi);
22250 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22251 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22254 // Multiply, mask the lower 8 bits of the lo/hi results and pack
22255 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22256 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22257 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
22258 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
22259 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22262 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
22263 if (VT == MVT::v4i32) {
22264 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
22265 "Should not custom lower when pmulld is available!");
22267 // If the upper 17 bits of each element are zero then we can use PMADD.
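// PMADDWD multiplies the signed i16 lanes and adds adjacent pairs. When the
// upper 17 bits of every i32 element are zero, each element fits in its low
// i16 lane as a non-negative value and the adjacent (high) lane is zero, so
// each pair sum is just lo*lo, the exact 32-bit product (it cannot exceed
// 2^30, so no overflow).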
22268 APInt Mask17 = APInt::getHighBitsSet(32, 17);
22269 if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
22270 return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
22271 DAG.getBitcast(MVT::v8i16, A),
22272 DAG.getBitcast(MVT::v8i16, B));
22274 // Extract the odd parts.
22275 static const int UnpackMask[] = { 1, -1, 3, -1 };
22276 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
22277 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
22279 // Multiply the even parts.
22280 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
22281 // Now multiply odd parts.
22282 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
22284 Evens = DAG.getBitcast(VT, Evens);
22285 Odds = DAG.getBitcast(VT, Odds);
22287 // Merge the two vectors back together with a shuffle. This expands into 2
22289 static const int ShufMask[] = { 0, 4, 2, 6 };
22290 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
22293 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
22294 "Only know how to lower V2I64/V4I64/V8I64 multiply");
22296 // 32-bit vector types used for PMULDQ/PMULUDQ.
22297 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22299 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
22300 // 32 bits. We can lower with this if the sign bits stretch that far.
22301 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
22302 DAG.ComputeNumSignBits(B) > 32) {
22303 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
22304 DAG.getBitcast(MulVT, B));
22307 // Ahi = psrlqi(a, 32);
22308 // Bhi = psrlqi(b, 32);
22310 // AloBlo = pmuludq(a, b);
22311 // AloBhi = pmuludq(a, Bhi);
22312 // AhiBlo = pmuludq(Ahi, b);
22314 // Hi = psllqi(AloBhi + AhiBlo, 32);
22315 // return AloBlo + Hi;
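// This follows from writing A = Ahi*2^32 + Alo and B = Bhi*2^32 + Blo:
// A*B mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32), since the Ahi*Bhi
// term is shifted out entirely.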
22316 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
22317 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
22318 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
22320 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
22321 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
22322 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
22324 // If DQI is supported we can use MULLQ, but MULUDQ is still better if the
22325 // high bits are known to be zero.
22326 if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
22329 // Bit cast to 32-bit vectors for MULUDQ.
22330 SDValue Alo = DAG.getBitcast(MulVT, A);
22331 SDValue Blo = DAG.getBitcast(MulVT, B);
22333 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22335 // Only multiply lo/hi halves that aren't known to be zero.
22336 SDValue AloBlo = Zero;
22337 if (!ALoIsZero && !BLoIsZero)
22338 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
22340 SDValue AloBhi = Zero;
22341 if (!ALoIsZero && !BHiIsZero) {
22342 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
22343 Bhi = DAG.getBitcast(MulVT, Bhi);
22344 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
22347 SDValue AhiBlo = Zero;
22348 if (!AHiIsZero && !BLoIsZero) {
22349 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
22350 Ahi = DAG.getBitcast(MulVT, Ahi);
22351 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
22354 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
22355 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
22357 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
22360 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
22361 SelectionDAG &DAG) {
22363 MVT VT = Op.getSimpleValueType();
22365 // Decompose 256-bit ops into smaller 128-bit ops.
22366 if (VT.is256BitVector() && !Subtarget.hasInt256())
22367 return Lower256IntArith(Op, DAG);
22369 // Only i8 vectors should need custom lowering after this.
22370 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
22371 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
22372 "Unsupported vector type");
22374 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
22375 // logical shift down the upper half and pack back to i8.
22376 SDValue A = Op.getOperand(0);
22377 SDValue B = Op.getOperand(1);
22379 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
22380 // and then ashr/lshr the upper bits down to the lower bits before multiply.
22381 unsigned Opcode = Op.getOpcode();
22382 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
22383 unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
22385 // For 512-bit vectors, split into 256-bit vectors to allow the
22386 // sign-extension to occur.
22387 if (VT == MVT::v64i8)
22388 return Lower512IntArith(Op, DAG);
22390 // AVX2 implementations - extend xmm subvectors to ymm.
22391 if (Subtarget.hasInt256()) {
22392 unsigned NumElems = VT.getVectorNumElements();
22393 SDValue Lo = DAG.getIntPtrConstant(0, dl);
22394 SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
22396 if (VT == MVT::v32i8) {
22397 if (Subtarget.hasBWI()) {
22398 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
22399 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
22400 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
22401 Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
22402 DAG.getConstant(8, dl, MVT::v32i16));
22403 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22405 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
22406 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
22407 SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
22408 SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
22409 ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
22410 BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
22411 AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
22412 BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
22413 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22414 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
22415 DAG.getConstant(8, dl, MVT::v16i16));
22416 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22417 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
22418 DAG.getConstant(8, dl, MVT::v16i16));
22419 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
22420 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
22421 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
22422 16, 17, 18, 19, 20, 21, 22, 23};
22423 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22424 24, 25, 26, 27, 28, 29, 30, 31};
22425 return DAG.getNode(X86ISD::PACKUS, dl, VT,
22426 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
22427 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
22430 assert(VT == MVT::v16i8 && "Unexpected VT");
22432 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
22433 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
22434 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
22435 Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
22436 DAG.getConstant(8, dl, MVT::v16i16));
22437 // If we have BWI we can use the truncate instruction.
22438 if (Subtarget.hasBWI())
22439 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22440 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
22441 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
22442 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22445 assert(VT == MVT::v16i8 &&
22446 "Pre-AVX2 support only supports v16i8 multiplication");
22447 MVT ExVT = MVT::v8i16;
22448 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
22450 // Extract the lo parts and zero/sign extend to i16.
22452 if (Subtarget.hasSSE41()) {
22453 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
22454 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
22456 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22457 -1, 4, -1, 5, -1, 6, -1, 7};
22458 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22459 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22460 ALo = DAG.getBitcast(ExVT, ALo);
22461 BLo = DAG.getBitcast(ExVT, BLo);
22462 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22463 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
22466 // Extract the hi parts and zero/sign extend to i16.
22468 if (Subtarget.hasSSE41()) {
22469 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22470 -1, -1, -1, -1, -1, -1, -1, -1};
22471 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22472 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22473 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
22474 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
22476 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22477 -1, 12, -1, 13, -1, 14, -1, 15};
22478 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22479 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22480 AHi = DAG.getBitcast(ExVT, AHi);
22481 BHi = DAG.getBitcast(ExVT, BHi);
22482 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22483 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22486 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
22487 // pack back to v16i8.
22488 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22489 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22490 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
22491 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
22492 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22495 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
22496 assert(Subtarget.isTargetWin64() && "Unexpected target");
22497 EVT VT = Op.getValueType();
22498 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
22499 "Unexpected return type for lowering");
22503 switch (Op->getOpcode()) {
22504 default: llvm_unreachable("Unexpected request for libcall!");
22505 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
22506 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
22507 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
22508 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
22509 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
22510 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
22514 SDValue InChain = DAG.getEntryNode();
22516 TargetLowering::ArgListTy Args;
22517 TargetLowering::ArgListEntry Entry;
22518 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
22519 EVT ArgVT = Op->getOperand(i).getValueType();
22520 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
22521 "Unexpected argument type for lowering");
22522 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
22523 Entry.Node = StackPtr;
22524 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
22525 MachinePointerInfo(), /* Alignment = */ 16);
22526 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22527 Entry.Ty = PointerType::get(ArgTy,0);
22528 Entry.IsSExt = false;
22529 Entry.IsZExt = false;
22530 Args.push_back(Entry);
22533 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
22534 getPointerTy(DAG.getDataLayout()));
22536 TargetLowering::CallLoweringInfo CLI(DAG);
22537 CLI.setDebugLoc(dl)
22540 getLibcallCallingConv(LC),
22541 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
22544 .setSExtResult(isSigned)
22545 .setZExtResult(!isSigned);
22547 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
22548 return DAG.getBitcast(VT, CallInfo.first);
22551 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
22552 SelectionDAG &DAG) {
22553 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
22554 MVT VT = Op0.getSimpleValueType();
22557 // Decompose 256-bit ops into smaller 128-bit ops.
22558 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22559 unsigned Opcode = Op.getOpcode();
22560 unsigned NumElems = VT.getVectorNumElements();
22561 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
22562 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
22563 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
22564 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
22565 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
22566 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
22567 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
22569 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
22570 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
22572 return DAG.getMergeValues(Ops, dl);
22575 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
22576 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
22577 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
22579 int NumElts = VT.getVectorNumElements();
22581 // PMULxD operations multiply each even value (starting at 0) of LHS with
22582 // the corresponding value of RHS and produce a widened result.
22583 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22584 // => <2 x i64> <ae|cg>
22586 // In other words, to have all the results, we need to perform two PMULxD:
22587 // 1. one with the even values.
22588 // 2. one with the odd values.
22589 // To achieve #2, we need to place the odd values at an even position.
22591 // Place the odd values at an even position (basically, shift all values 1
22592 // step to the left):
22593 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
22594 // <a|b|c|d> => <b|undef|d|undef>
22595 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
22596 makeArrayRef(&Mask[0], NumElts));
22597 // <e|f|g|h> => <f|undef|h|undef>
22598 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
22599 makeArrayRef(&Mask[0], NumElts));
22601 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
22603 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
22604 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
22606 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
22607 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22608 // => <2 x i64> <ae|cg>
22609 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
22610 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
22611 // => <2 x i64> <bf|dh>
22612 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
22614 // Shuffle it back into the right order.
22615 SmallVector<int, 16> HighMask(NumElts);
22616 SmallVector<int, 16> LowMask(NumElts);
22617 for (int i = 0; i != NumElts; ++i) {
22618 HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
22619 LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
22622 SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
22623 SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
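// For NumElts = 4 this gives LowMask = {0, 4, 2, 6} and HighMask = {1, 5, 3, 7},
// so Lows/Highs interleave the even-element products from Mul1 with the
// odd-element products from Mul2, restoring the original element order.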
22625 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
22626 // unsigned multiply.
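// Why this works: interpreting a negative 32-bit element as unsigned adds
// 2^32 to it, so the unsigned product's high half is too large by
// (A < 0 ? B : 0) + (B < 0 ? A : 0). The arithmetic shift right by the
// element width minus one turns each negative element into an all-ones mask,
// so T1/T2 below are exactly those conditional terms and subtracting their
// sum corrects the high halves.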
22627 if (IsSigned && !Subtarget.hasSSE41()) {
22628 SDValue ShAmt = DAG.getConstant(
22630 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
22631 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
22632 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
22633 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
22634 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
22636 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
22637 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
22640 // The first result of MUL_LOHI is actually the low value, followed by the
22642 SDValue Ops[] = {Lows, Highs};
22643 return DAG.getMergeValues(Ops, dl);
22646 // Return true if the required (according to Opcode) shift-imm form is natively
22647 // supported by the Subtarget
22648 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
22650 if (VT.getScalarSizeInBits() < 16)
22653 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
22654 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
22657 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
22658 (VT.is256BitVector() && Subtarget.hasInt256());
22660 bool AShift = LShift && (Subtarget.hasAVX512() ||
22661 (VT != MVT::v2i64 && VT != MVT::v4i64));
22662 return (Opcode == ISD::SRA) ? AShift : LShift;
22665 // The shift amount is a variable, but it is the same for all vector lanes.
22666 // These instructions are defined together with shift-immediate.
22668 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
22670 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
22673 // Return true if the required (according to Opcode) variable-shift form is
22674 // natively supported by the Subtarget
22675 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
22678 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
22681 // vXi16 is supported only on AVX-512 with BWI.
22682 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
22685 if (Subtarget.hasAVX512())
22688 bool LShift = VT.is128BitVector() || VT.is256BitVector();
22689 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
22690 return (Opcode == ISD::SRA) ? AShift : LShift;
22693 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
22694 const X86Subtarget &Subtarget) {
22695 MVT VT = Op.getSimpleValueType();
22697 SDValue R = Op.getOperand(0);
22698 SDValue Amt = Op.getOperand(1);
22700 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22701 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22703 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22704 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22705 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22706 SDValue Ex = DAG.getBitcast(ExVT, R);
22708 // ashr(R, 63) === cmp_slt(R, 0)
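// i.e. shifting arithmetically by 63 replicates the sign bit across all 64
// bits, yielding 0 or -1, which is exactly what PCMPGT(0, R) produces.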
22709 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22710 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22711 "Unsupported PCMPGT op");
22712 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
22713 getZeroVector(VT, Subtarget, DAG, dl), R);
22716 if (ShiftAmt >= 32) {
22717 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
22719 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22720 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22721 ShiftAmt - 32, DAG);
22722 if (VT == MVT::v2i64)
22723 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22724 if (VT == MVT::v4i64)
22725 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22726 {9, 1, 11, 3, 13, 5, 15, 7});
22728 // SRA upper i32, SRL whole i64 and select lower i32.
22729 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22732 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22733 Lower = DAG.getBitcast(ExVT, Lower);
22734 if (VT == MVT::v2i64)
22735 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22736 if (VT == MVT::v4i64)
22737 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22738 {8, 1, 10, 3, 12, 5, 14, 7});
22740 return DAG.getBitcast(VT, Ex);
22743 // Optimize shl/srl/sra with constant shift amount.
22744 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22745 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22746 uint64_t ShiftAmt = ShiftConst->getZExtValue();
22748 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22749 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22751 // i64 SRA needs to be performed as partial shifts.
22752 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22753 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22754 Op.getOpcode() == ISD::SRA)
22755 return ArithmeticShiftRight64(ShiftAmt);
22757 if (VT == MVT::v16i8 ||
22758 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22759 VT == MVT::v64i8) {
22760 unsigned NumElts = VT.getVectorNumElements();
22761 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22763 // Simple i8 add case
22764 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22765 return DAG.getNode(ISD::ADD, dl, VT, R, R);
22767 // ashr(R, 7) === cmp_slt(R, 0)
22768 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22769 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22770 if (VT.is512BitVector()) {
22771 assert(VT == MVT::v64i8 && "Unexpected element type!");
22772 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22773 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22775 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22778 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22779 if (VT == MVT::v16i8 && Subtarget.hasXOP())
22782 if (Op.getOpcode() == ISD::SHL) {
22783 // Make a large shift.
22784 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22786 SHL = DAG.getBitcast(VT, SHL);
22787 // Zero out the rightmost bits.
22788 return DAG.getNode(ISD::AND, dl, VT, SHL,
22789 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22791 if (Op.getOpcode() == ISD::SRL) {
22792 // Make a large shift.
22793 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22795 SRL = DAG.getBitcast(VT, SRL);
22796 // Zero out the leftmost bits.
22797 return DAG.getNode(ISD::AND, dl, VT, SRL,
22798 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22800 if (Op.getOpcode() == ISD::SRA) {
22801 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
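// The identity works because after the logical shift the sign bit sits at
// position 7 - Amt, which is exactly where Mask = 128 >> Amt has its single
// set bit; XOR-then-subtract with that mask sign-extends the shifted value
// (e.g. lshr(0x80, 2) = 0x20, xor 0x20 -> 0x00, minus 0x20 -> 0xE0 = ashr(0x80, 2)).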
22802 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22804 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22805 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22806 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22809 llvm_unreachable("Unknown shift opcode.");
22814 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
22815 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22816 if (!Subtarget.hasXOP() &&
22817 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22818 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22820 // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
22821 unsigned SubVectorScale = 1;
22822 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22824 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22825 Amt = Amt.getOperand(0);
22828 // Peek through any splat that was introduced for i64 shift vectorization.
22829 int SplatIndex = -1;
22830 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22831 if (SVN->isSplat()) {
22832 SplatIndex = SVN->getSplatIndex();
22833 Amt = Amt.getOperand(0);
22834 assert(SplatIndex < (int)VT.getVectorNumElements() &&
22835 "Splat shuffle referencing second operand");
22838 if (Amt.getOpcode() != ISD::BITCAST ||
22839 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22842 Amt = Amt.getOperand(0);
22843 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22844 (SubVectorScale * VT.getVectorNumElements());
22845 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22846 uint64_t ShiftAmt = 0;
22847 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22848 for (unsigned i = 0; i != Ratio; ++i) {
22849 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22853 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
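// For example, with v2i64 shifts whose amount comes from a v4i32
// BUILD_VECTOR, Ratio = 2 and RatioInLog2 = 1, so each 32-bit constant is
// placed at bit offset i * 32 to reassemble the original 64-bit shift amount.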
22856 // Check remaining shift amounts (if not a splat).
22857 if (SplatIndex < 0) {
22858 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22859 uint64_t ShAmt = 0;
22860 for (unsigned j = 0; j != Ratio; ++j) {
22861 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22865 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22867 if (ShAmt != ShiftAmt)
22872 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22873 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22875 if (Op.getOpcode() == ISD::SRA)
22876 return ArithmeticShiftRight64(ShiftAmt);
22882 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22883 const X86Subtarget &Subtarget) {
22884 MVT VT = Op.getSimpleValueType();
22886 SDValue R = Op.getOperand(0);
22887 SDValue Amt = Op.getOperand(1);
22889 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22890 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22892 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22893 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22895 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22897 MVT EltVT = VT.getVectorElementType();
22899 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22900 // Check if this build_vector node is doing a splat.
22901 // If so, then set BaseShAmt equal to the splat value.
22902 BaseShAmt = BV->getSplatValue();
22903 if (BaseShAmt && BaseShAmt.isUndef())
22904 BaseShAmt = SDValue();
22906 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22907 Amt = Amt.getOperand(0);
22909 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22910 if (SVN && SVN->isSplat()) {
22911 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22912 SDValue InVec = Amt.getOperand(0);
22913 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22914 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22915 "Unexpected shuffle index found!");
22916 BaseShAmt = InVec.getOperand(SplatIdx);
22917 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22918 if (ConstantSDNode *C =
22919 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22920 if (C->getZExtValue() == SplatIdx)
22921 BaseShAmt = InVec.getOperand(1);
22926 // Avoid introducing an extract element from a shuffle.
22927 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22928 DAG.getIntPtrConstant(SplatIdx, dl));
22932 if (BaseShAmt.getNode()) {
22933 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22934 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22935 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22936 else if (EltVT.bitsLT(MVT::i32))
22937 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22939 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22943 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
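// For example, on a 32-bit target a uniform v2i64 shift amount typically
// reaches here as (v2i64 bitcast (v4i32 build_vector <A, 0, A, 0>)); with
// Ratio == 2 the checks below just verify that every <lo, hi> pair matches
// the first one.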
22944 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
22945 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22946 Amt = Amt.getOperand(0);
22947 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22948 VT.getVectorNumElements();
22949 std::vector<SDValue> Vals(Ratio);
22950 for (unsigned i = 0; i != Ratio; ++i)
22951 Vals[i] = Amt.getOperand(i);
22952 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22953 for (unsigned j = 0; j != Ratio; ++j)
22954 if (Vals[j] != Amt.getOperand(i + j))
22958 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22959 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22964 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22965 SelectionDAG &DAG) {
22966 MVT VT = Op.getSimpleValueType();
22968 SDValue R = Op.getOperand(0);
22969 SDValue Amt = Op.getOperand(1);
22970 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22972 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22973 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22975 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22978 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22981 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22984 // XOP has 128-bit variable logical/arithmetic shifts.
22985 // +ve/-ve Amt = shift left/right.
22986 if (Subtarget.hasXOP() &&
22987 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22988 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22989 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22990 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22991 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22993 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22994 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22995 if (Op.getOpcode() == ISD::SRA)
22996 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22999 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
23000 // shifts per-lane and then shuffle the partial results back together.
23001 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
23002 // Splat the shift amounts so the scalar shifts above will catch it.
23003 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
23004 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
23005 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
23006 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
23007 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
23010 // i64 vector arithmetic shift can be emulated with the transform:
23011 // M = lshr(SIGN_MASK, Amt)
23012 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
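// M marks where each element's sign bit ends up after the logical shift;
// the xor/sub pair then sign-extends that bit through the vacated upper
// bits. For example, with R = -16 (0xFFF...F0) and Amt = 4:
//   lshr(R, 4) = 0x0FFF...FF,  M = 0x0800...00
//   xor        = 0x07FF...FF,  sub M = 0xFFF...FF = -1 = ashr(-16, 4).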
23013 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
23014 Op.getOpcode() == ISD::SRA) {
23015 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
23016 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
23017 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23018 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
23019 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
23023 // If possible, lower this packed shift into a vector multiply instead of
23024 // expanding it into a sequence of scalar shifts.
23025 // Do this only if the vector shift count is a constant build_vector.
23026 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
23027 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
23028 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
23029 SmallVector<SDValue, 8> Elts;
23030 MVT SVT = VT.getVectorElementType();
23031 unsigned SVTBits = SVT.getSizeInBits();
23032 APInt One(SVTBits, 1);
23033 unsigned NumElems = VT.getVectorNumElements();
23035 for (unsigned i=0; i !=NumElems; ++i) {
23036 SDValue Op = Amt->getOperand(i);
23037 if (Op->isUndef()) {
23038 Elts.push_back(Op);
23042 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
23043 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
23044 uint64_t ShAmt = C.getZExtValue();
23045 if (ShAmt >= SVTBits) {
23046 Elts.push_back(DAG.getUNDEF(SVT));
23049 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
23051 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
23052 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
23055 // Lower SHL with variable shift amount.
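// The block below builds (1 << Amt) per lane by forming a float whose
// exponent field is Amt: (Amt << 23) + 0x3f800000 is the IEEE-754 bit
// pattern of 2^Amt for in-range amounts, and FP_TO_SINT turns that back
// into an integer power of two. For example, Amt = 3 gives
// 0x41000000 == 8.0f, and R * 8 == R << 3.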
23056 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
23057 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
23059 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
23060 DAG.getConstant(0x3f800000U, dl, VT));
23061 Op = DAG.getBitcast(MVT::v4f32, Op);
23062 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
23063 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
23066 // If possible, lower this shift as a sequence of two shifts by
23067 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
23069 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
23071 // Could be rewritten as:
23072 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
23074 // The advantage is that the two shifts from the example would be
23075 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
23076 // the vector shift into four scalar shifts plus four pairs of vector
23077 // insert/extract.
23078 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
23079 bool UseMOVSD = false;
23080 bool CanBeSimplified;
23081 // The splat value for the first packed shift (the 'X' from the example).
23082 SDValue Amt1 = Amt->getOperand(0);
23083 // The splat value for the second packed shift (the 'Y' from the example).
23084 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
23086 // See if it is possible to replace this node with a sequence of
23087 // two shifts followed by a MOVSS/MOVSD/PBLEND.
23088 if (VT == MVT::v4i32) {
23089 // Check if it is legal to use a MOVSS.
23090 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
23091 Amt2 == Amt->getOperand(3);
23092 if (!CanBeSimplified) {
23093 // Otherwise, check if we can still simplify this node using a MOVSD.
23094 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
23095 Amt->getOperand(2) == Amt->getOperand(3);
23097 Amt2 = Amt->getOperand(2);
23100 // Do similar checks for the case where the machine value type
23101 // is MVT::v8i16.
23102 CanBeSimplified = Amt1 == Amt->getOperand(1);
23103 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
23104 CanBeSimplified = Amt2 == Amt->getOperand(i);
23106 if (!CanBeSimplified) {
23108 CanBeSimplified = true;
23109 Amt2 = Amt->getOperand(4);
23110 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
23111 CanBeSimplified = Amt1 == Amt->getOperand(i);
23112 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
23113 CanBeSimplified = Amt2 == Amt->getOperand(j);
23117 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
23118 isa<ConstantSDNode>(Amt2)) {
23119 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
23121 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
23122 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
23124 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
23125 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
23126 SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
23127 SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
23129 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23130 BitCast2, {0, 1, 6, 7}));
23131 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23132 BitCast2, {0, 5, 6, 7}));
23136 // v4i32 Non Uniform Shifts.
23137 // If the shift amount is constant we can shift each lane using the SSE2
23138 // immediate shifts, else we need to zero-extend each lane to the lower i64
23139 // and shift using the SSE2 variable shifts.
23140 // The separate results can then be blended together.
23141 if (VT == MVT::v4i32) {
23142 unsigned Opc = Op.getOpcode();
23143 SDValue Amt0, Amt1, Amt2, Amt3;
23145 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
23146 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
23147 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
23148 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
23150 // ISD::SHL is handled above but we include it here for completeness.
23153 llvm_unreachable("Unknown target vector shift node");
23155 Opc = X86ISD::VSHL;
23158 Opc = X86ISD::VSRL;
23161 Opc = X86ISD::VSRA;
23164 // The SSE2 shifts use the lower i64 as the same shift amount for
23165 // all lanes and the upper i64 is ignored. These shuffle masks
23166 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
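// For example, mask {0, 4, -1, -1} on (Amt, Z) places Amt[0] in element 0
// and a zero in element 1, so the low 64 bits hold the zero-extended shift
// count for lane 0; {1, 5, -1, -1} does the same for lane 1, and so on.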
23167 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23168 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
23169 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
23170 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
23171 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
23174 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
23175 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
23176 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
23177 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
23178 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
23179 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
23180 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
23183 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
23184 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
23185 // make the existing SSE solution better.
23186 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
23187 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
23188 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
23189 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
23190 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
23191 "Unexpected vector type");
23192 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
23193 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
23195 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23196 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
23197 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
23198 return DAG.getNode(ISD::TRUNCATE, dl, VT,
23199 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
23202 if (VT == MVT::v16i8 ||
23203 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
23204 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
23205 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23206 unsigned ShiftOpcode = Op->getOpcode();
23208 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23209 if (VT.is512BitVector()) {
23210 // On AVX512BW targets we make use of the fact that VSELECT lowers
23211 // to a masked blend which selects bytes based just on the sign bit
23212 // extracted to a mask.
23213 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23214 V0 = DAG.getBitcast(VT, V0);
23215 V1 = DAG.getBitcast(VT, V1);
23216 Sel = DAG.getBitcast(VT, Sel);
23217 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
23218 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23219 } else if (Subtarget.hasSSE41()) {
23220 // On SSE41 targets we make use of the fact that VSELECT lowers
23221 // to PBLENDVB which selects bytes based just on the sign bit.
23222 V0 = DAG.getBitcast(VT, V0);
23223 V1 = DAG.getBitcast(VT, V1);
23224 Sel = DAG.getBitcast(VT, Sel);
23225 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23227 // On pre-SSE41 targets we test for the sign bit by comparing to
23228 // zero - a negative value will set all bits of the lanes to true
23229 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23230 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
23231 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
23232 return DAG.getSelect(dl, SelVT, C, V0, V1);
23235 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23236 // We can safely do this using i16 shifts as we're only interested in
23237 // the 3 lower bits of each byte.
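// After 'a << 5', bit 2 of each byte's shift amount sits in that byte's
// sign bit, which is exactly what the sign-bit selects below key on for
// the shift-by-4 step; doubling 'a' after each step exposes bit 1 and then
// bit 0 for the shift-by-2 and shift-by-1 steps.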
23238 Amt = DAG.getBitcast(ExtVT, Amt);
23239 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
23240 Amt = DAG.getBitcast(VT, Amt);
23242 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
23243 // r = VSELECT(r, shift(r, 4), a);
23245 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23246 R = SignBitSelect(VT, Amt, M, R);
23249 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23251 // r = VSELECT(r, shift(r, 2), a);
23252 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23253 R = SignBitSelect(VT, Amt, M, R);
23256 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23258 // return VSELECT(r, shift(r, 1), a);
23259 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23260 R = SignBitSelect(VT, Amt, M, R);
23264 if (Op->getOpcode() == ISD::SRA) {
23265 // For SRA we need to unpack each byte to the higher byte of an i16 vector
23266 // so we can correctly sign extend. We don't care what happens to the lower byte.
23268 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
23269 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
23270 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
23271 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
23272 ALo = DAG.getBitcast(ExtVT, ALo);
23273 AHi = DAG.getBitcast(ExtVT, AHi);
23274 RLo = DAG.getBitcast(ExtVT, RLo);
23275 RHi = DAG.getBitcast(ExtVT, RHi);
23277 // r = VSELECT(r, shift(r, 4), a);
23278 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23279 DAG.getConstant(4, dl, ExtVT));
23280 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23281 DAG.getConstant(4, dl, ExtVT));
23282 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23283 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23286 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23287 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23289 // r = VSELECT(r, shift(r, 2), a);
23290 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23291 DAG.getConstant(2, dl, ExtVT));
23292 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23293 DAG.getConstant(2, dl, ExtVT));
23294 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23295 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23298 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23299 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23301 // r = VSELECT(r, shift(r, 1), a);
23302 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23303 DAG.getConstant(1, dl, ExtVT));
23304 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23305 DAG.getConstant(1, dl, ExtVT));
23306 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23307 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23309 // Logical shift the result back to the lower byte, leaving a zero upper
23311 // byte, meaning that we can safely pack with PACKUSWB.
23313 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
23315 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
23316 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
23320 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
23321 MVT ExtVT = MVT::v8i32;
23322 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23323 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
23324 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
23325 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
23326 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
23327 ALo = DAG.getBitcast(ExtVT, ALo);
23328 AHi = DAG.getBitcast(ExtVT, AHi);
23329 RLo = DAG.getBitcast(ExtVT, RLo);
23330 RHi = DAG.getBitcast(ExtVT, RHi);
23331 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
23332 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
23333 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
23334 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
23335 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23338 if (VT == MVT::v8i16) {
23339 unsigned ShiftOpcode = Op->getOpcode();
23341 // If we have a constant shift amount, the non-SSE41 path is best as
23342 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
23343 bool UseSSE41 = Subtarget.hasSSE41() &&
23344 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23346 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
23347 // On SSE41 targets we make use of the fact that VSELECT lowers
23348 // to PBLENDVB which selects bytes based just on the sign bit.
23350 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
23351 V0 = DAG.getBitcast(ExtVT, V0);
23352 V1 = DAG.getBitcast(ExtVT, V1);
23353 Sel = DAG.getBitcast(ExtVT, Sel);
23354 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
23356 // On pre-SSE41 targets we splat the sign bit - a negative value will
23357 // set all bits of the lanes to true and VSELECT uses that in
23358 // its OR(AND(V0,C),AND(V1,~C)) lowering.
23360 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
23361 return DAG.getSelect(dl, VT, C, V0, V1);
23364 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
23366 // On SSE41 targets we need to replicate the shift mask in both
23367 // bytes for PBLENDVB.
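// Shifting the (at most 4-bit) i16 shift amount left by 12 puts bit 3 in
// the element's sign bit for the shift-by-8 select below; the extra '<< 4'
// copy on SSE41 mirrors that bit into the low byte's sign bit as well,
// since PBLENDVB selects each byte from its own sign bit.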
23370 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
23371 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
23373 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
23376 // r = VSELECT(r, shift(r, 8), a);
23377 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
23378 R = SignBitSelect(Amt, M, R);
23381 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23383 // r = VSELECT(r, shift(r, 4), a);
23384 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23385 R = SignBitSelect(Amt, M, R);
23388 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23390 // r = VSELECT(r, shift(r, 2), a);
23391 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23392 R = SignBitSelect(Amt, M, R);
23395 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23397 // return VSELECT(r, shift(r, 1), a);
23398 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23399 R = SignBitSelect(Amt, M, R);
23403 // Decompose 256-bit shifts into smaller 128-bit shifts.
23404 if (VT.is256BitVector())
23405 return Lower256IntArith(Op, DAG);
23410 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
23411 SelectionDAG &DAG) {
23412 MVT VT = Op.getSimpleValueType();
23414 SDValue R = Op.getOperand(0);
23415 SDValue Amt = Op.getOperand(1);
23416 unsigned Opcode = Op.getOpcode();
23417 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23419 if (Subtarget.hasAVX512()) {
23420 // Attempt to rotate by immediate.
23422 SmallVector<APInt, 16> EltBits;
23423 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
23424 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
23425 return EltBits[0] == V;
23427 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
23428 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
23429 return DAG.getNode(Op, DL, VT, R,
23430 DAG.getConstant(RotateAmt, DL, MVT::i8));
23434 // Else, fall back on VPROLV/VPRORV.
23438 assert(VT.isVector() && "Custom lowering only for vector rotates!");
23439 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
23440 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
23442 // XOP has 128-bit vector variable + immediate rotates.
23443 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
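// (A rotate right by N could be expressed as a VPROT by -N, but only
// ISD::ROTL reaches this point - see the assert above - so no negation of
// the amount is needed here.)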
23445 // Split 256-bit integers.
23446 if (VT.is256BitVector())
23447 return Lower256IntArith(Op, DAG);
23449 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
23451 // Attempt to rotate by immediate.
23452 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23453 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23454 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23455 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23456 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
23457 DAG.getConstant(RotateAmt, DL, MVT::i8));
23461 // Use general rotate by variable (per-element).
23465 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23466 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23467 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23468 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23469 // has only one use.
23470 SDNode *N = Op.getNode();
23471 SDValue LHS = N->getOperand(0);
23472 SDValue RHS = N->getOperand(1);
23473 unsigned BaseOp = 0;
23474 X86::CondCode Cond;
23476 switch (Op.getOpcode()) {
23477 default: llvm_unreachable("Unknown ovf instruction!");
23479 // An add of one will be selected as an INC. Note that INC doesn't
23480 // set CF, so we can't do this for UADDO.
23481 if (isOneConstant(RHS)) {
23482 BaseOp = X86ISD::INC;
23483 Cond = X86::COND_O;
23486 BaseOp = X86ISD::ADD;
23487 Cond = X86::COND_O;
23490 BaseOp = X86ISD::ADD;
23491 Cond = X86::COND_B;
23494 // A subtract of one will be selected as a DEC. Note that DEC doesn't
23495 // set CF, so we can't do this for USUBO.
23496 if (isOneConstant(RHS)) {
23497 BaseOp = X86ISD::DEC;
23498 Cond = X86::COND_O;
23501 BaseOp = X86ISD::SUB;
23502 Cond = X86::COND_O;
23505 BaseOp = X86ISD::SUB;
23506 Cond = X86::COND_B;
23509 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
23510 Cond = X86::COND_O;
23512 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
23513 if (N->getValueType(0) == MVT::i8) {
23514 BaseOp = X86ISD::UMUL8;
23515 Cond = X86::COND_O;
23518 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
23520 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
23522 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
23524 if (N->getValueType(1) == MVT::i1)
23525 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23527 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23531 // Also sets EFLAGS.
23532 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
23533 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23535 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
23537 if (N->getValueType(1) == MVT::i1)
23538 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23540 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23543 /// Returns true if the operand type is exactly twice the native width, and
23544 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
23545 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
23546 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
23547 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
23548 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
23551 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
23552 else if (OpWidth == 128)
23553 return Subtarget.hasCmpxchg16b();
23558 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
23559 return needsCmpXchgNb(SI->getValueOperand()->getType());
23562 // Note: this turns large loads into lock cmpxchg8b/16b.
23563 // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
23564 TargetLowering::AtomicExpansionKind
23565 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
23566 auto PTy = cast<PointerType>(LI->getPointerOperandType());
23567 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
23568 : AtomicExpansionKind::None;
23571 TargetLowering::AtomicExpansionKind
23572 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
23573 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23574 Type *MemType = AI->getType();
23576 // If the operand is too big, we must see if cmpxchg8/16b is available
23577 // and default to library calls otherwise.
23578 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
23579 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
23580 : AtomicExpansionKind::None;
23583 AtomicRMWInst::BinOp Op = AI->getOperation();
23586 llvm_unreachable("Unknown atomic operation");
23587 case AtomicRMWInst::Xchg:
23588 case AtomicRMWInst::Add:
23589 case AtomicRMWInst::Sub:
23590 // It's better to use xadd, xsub or xchg for these in all cases.
23591 return AtomicExpansionKind::None;
23592 case AtomicRMWInst::Or:
23593 case AtomicRMWInst::And:
23594 case AtomicRMWInst::Xor:
23595 // If the atomicrmw's result isn't actually used, we can just add a "lock"
23596 // prefix to a normal instruction for these operations.
23597 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
23598 : AtomicExpansionKind::None;
23599 case AtomicRMWInst::Nand:
23600 case AtomicRMWInst::Max:
23601 case AtomicRMWInst::Min:
23602 case AtomicRMWInst::UMax:
23603 case AtomicRMWInst::UMin:
23604 // These always require a non-trivial set of data operations on x86. We must
23605 // use a cmpxchg loop.
23606 return AtomicExpansionKind::CmpXChg;
23611 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
23612 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23613 Type *MemType = AI->getType();
23614 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
23615 // there is no benefit in turning such RMWs into loads, and it is actually
23616 // harmful as it introduces an mfence.
23617 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
23620 auto Builder = IRBuilder<>(AI);
23621 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
23622 auto SSID = AI->getSyncScopeID();
23623 // We must restrict the ordering to avoid generating loads with Release or
23624 // ReleaseAcquire orderings.
23625 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
23626 auto Ptr = AI->getPointerOperand();
23628 // Before the load we need a fence. Here is an example lifted from
23629 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
23630 // is required:
23631 //   Thread 0:
23632 //     x.store(1, relaxed);
23633 //     r1 = y.fetch_add(0, release);
23634 //   Thread 1:
23635 //     y.fetch_add(42, acquire);
23636 //     r2 = x.load(relaxed);
23637 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
23638 // lowered to just a load without a fence. An mfence flushes the store buffer,
23639 // making the optimization clearly correct.
23640 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
23641 // clear whether it is needed otherwise; we might be able to be more
23642 // aggressive on relaxed idempotent rmw. In practice, such cases do not look
23643 // useful, so we don't try to be especially clever.
23644 if (SSID == SyncScope::SingleThread)
23645 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
23646 // the IR level, so we must wrap it in an intrinsic.
23649 if (!Subtarget.hasMFence())
23650 // FIXME: it might make sense to use a locked operation here but on a
23651 // different cache-line to prevent cache-line bouncing. In practice it
23652 // is probably a small win, and x86 processors without mfence are rare
23653 // enough that we do not bother.
23657 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
23658 Builder.CreateCall(MFence, {});
23660 // Finally we can emit the atomic load.
23661 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
23662 AI->getType()->getPrimitiveSizeInBits());
23663 Loaded->setAtomic(Order, SSID);
23664 AI->replaceAllUsesWith(Loaded);
23665 AI->eraseFromParent();
23669 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
23670 SelectionDAG &DAG) {
23672 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
23673 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
23674 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
23675 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
23677 // The only fence that needs an instruction is a sequentially-consistent
23678 // cross-thread fence.
23679 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
23680 FenceSSID == SyncScope::System) {
23681 if (Subtarget.hasMFence())
23682 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
23684 SDValue Chain = Op.getOperand(0);
23685 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
23687 DAG.getRegister(X86::ESP, MVT::i32), // Base
23688 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
23689 DAG.getRegister(0, MVT::i32), // Index
23690 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
23691 DAG.getRegister(0, MVT::i32), // Segment.
23695 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
23696 return SDValue(Res, 0);
23699 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
23700 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
23703 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
23704 SelectionDAG &DAG) {
23705 MVT T = Op.getSimpleValueType();
23709 switch(T.SimpleTy) {
23710 default: llvm_unreachable("Invalid value type!");
23711 case MVT::i8: Reg = X86::AL; size = 1; break;
23712 case MVT::i16: Reg = X86::AX; size = 2; break;
23713 case MVT::i32: Reg = X86::EAX; size = 4; break;
23715 assert(Subtarget.is64Bit() && "Node not type legal!");
23716 Reg = X86::RAX; size = 8;
23719 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
23720 Op.getOperand(2), SDValue());
23721 SDValue Ops[] = { cpIn.getValue(0),
23724 DAG.getTargetConstant(size, DL, MVT::i8),
23725 cpIn.getValue(1) };
23726 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23727 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
23728 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
23732 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
23733 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
23734 MVT::i32, cpOut.getValue(2));
23735 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
23737 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
23738 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
23739 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
23743 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
23744 SelectionDAG &DAG) {
23745 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
23746 MVT DstVT = Op.getSimpleValueType();
23748 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
23749 SrcVT == MVT::i64) {
23750 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23751 if (DstVT != MVT::f64)
23752 // This conversion needs to be expanded.
23755 SDValue Op0 = Op->getOperand(0);
23756 SmallVector<SDValue, 16> Elts;
23760 if (SrcVT.isVector()) {
23761 NumElts = SrcVT.getVectorNumElements();
23762 SVT = SrcVT.getVectorElementType();
23764 // Widen the input vector in the case of MVT::v2i32.
23765 // Example: from MVT::v2i32 to MVT::v4i32.
23766 for (unsigned i = 0, e = NumElts; i != e; ++i)
23767 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
23768 DAG.getIntPtrConstant(i, dl)));
23770 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
23771 "Unexpected source type in LowerBITCAST");
23772 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23773 DAG.getIntPtrConstant(0, dl)));
23774 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23775 DAG.getIntPtrConstant(1, dl)));
23779 // Explicitly mark the extra elements as Undef.
23780 Elts.append(NumElts, DAG.getUNDEF(SVT));
23782 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23783 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
23784 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
23785 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
23786 DAG.getIntPtrConstant(0, dl));
23789 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
23790 Subtarget.hasMMX() && "Unexpected custom BITCAST");
23791 assert((DstVT == MVT::i64 ||
23792 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
23793 "Unexpected custom BITCAST");
23794 // i64 <=> MMX conversions are Legal.
23795 if (SrcVT==MVT::i64 && DstVT.isVector())
23797 if (DstVT==MVT::i64 && SrcVT.isVector())
23799 // MMX <=> MMX conversions are Legal.
23800 if (SrcVT.isVector() && DstVT.isVector())
23802 // All other conversions need to be expanded.
23806 /// Compute the horizontal sum of bytes in V for the elements of VT.
23808 /// Requires V to be a byte vector and VT to be an integer vector type with
23809 /// wider elements than V's type. The width of the elements of VT determines
23810 // how many bytes of V are summed horizontally to produce each element of the
23811 // result.
23812 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
23813 const X86Subtarget &Subtarget,
23814 SelectionDAG &DAG) {
23816 MVT ByteVecVT = V.getSimpleValueType();
23817 MVT EltVT = VT.getVectorElementType();
23818 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23819 "Expected value to have byte element type.");
23820 assert(EltVT != MVT::i8 &&
23821 "Horizontal byte sum only makes sense for wider elements!");
23822 unsigned VecSize = VT.getSizeInBits();
23823 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
23825 // The PSADBW instruction horizontally adds all bytes and leaves the result in
23826 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
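// (PSADBW against a zero vector yields, per 64-bit chunk, the sum of the
// absolute differences |b - 0| of its eight bytes, i.e. the plain sum of
// the per-byte pop counts, which is exactly the i64 pop count.)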
23827 if (EltVT == MVT::i64) {
23828 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23829 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23830 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
23831 return DAG.getBitcast(VT, V);
23834 if (EltVT == MVT::i32) {
23835 // We unpack the low half and high half into i32s interleaved with zeros so
23836 // that we can use PSADBW to horizontally sum them. The most useful part of
23837 // this is that it lines up the results of two PSADBW instructions to be
23838 // two v2i64 vectors which concatenated are the 4 population counts. We can
23839 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
23840 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
23841 SDValue V32 = DAG.getBitcast(VT, V);
23842 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
23843 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
23845 // Do the horizontal sums into two v2i64s.
23846 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23847 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23848 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23849 DAG.getBitcast(ByteVecVT, Low), Zeros);
23850 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23851 DAG.getBitcast(ByteVecVT, High), Zeros);
23853 // Merge them together.
23854 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
23855 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
23856 DAG.getBitcast(ShortVecVT, Low),
23857 DAG.getBitcast(ShortVecVT, High));
23859 return DAG.getBitcast(VT, V);
23862 // The only element type left is i16.
23863 assert(EltVT == MVT::i16 && "Unknown how to handle type");
23865 // To obtain pop count for each i16 element starting from the pop count for
23866 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
23867 // right by 8. It is important to shift as i16s as i8 vector shift isn't
23868 // directly supported.
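// For example, an i16 lane holding byte counts 0x0102 (hi = 1, lo = 2):
//   shl 8 (as i16) -> 0x0200
//   add   (as i8s) -> 0x0302   (high byte now holds 1 + 2)
//   srl 8 (as i16) -> 0x0003 == the lane's total pop count.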
23869 SDValue ShifterV = DAG.getConstant(8, DL, VT);
23870 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23871 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
23872 DAG.getBitcast(ByteVecVT, V));
23873 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23876 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
23877 const X86Subtarget &Subtarget,
23878 SelectionDAG &DAG) {
23879 MVT VT = Op.getSimpleValueType();
23880 MVT EltVT = VT.getVectorElementType();
23881 unsigned VecSize = VT.getSizeInBits();
23883 // Implement a lookup table in register by using an algorithm based on:
23884 // http://wm.ite.pl/articles/sse-popcount.html
23886 // The general idea is that each nibble of every byte in the input vector is
23887 // an index into an in-register pre-computed pop count table. We then split up
23888 // the input vector into two new ones: (1) a vector with only the shifted-right
23889 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
23890 // masked out higher ones) for each byte. PSHUFB is used separately with both
23891 // to index the in-register table. Next, both are added and the result is an
23892 // i8 vector where each element contains the pop count of its input byte.
23894 // To obtain the pop count for elements != i8, we follow up with the same
23895 // approach and use additional tricks as described below.
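// For example, input byte 0xB7 (0b10110111): LUT[0xB] = 3 for the high
// nibble, LUT[0x7] = 3 for the low nibble, and 3 + 3 = 6 set bits.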
23897 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
23898 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
23899 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
23900 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
23902 int NumByteElts = VecSize / 8;
23903 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
23904 SDValue In = DAG.getBitcast(ByteVecVT, Op);
23905 SmallVector<SDValue, 64> LUTVec;
23906 for (int i = 0; i < NumByteElts; ++i)
23907 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23908 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
23909 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
23912 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
23913 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23916 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23918 // The input vector is used as the shuffle mask that index elements into the
23919 // LUT. After counting low and high nibbles, add the vector to obtain the
23920 // final pop count per i8 element.
23921 SDValue HighPopCnt =
23922 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23923 SDValue LowPopCnt =
23924 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23925 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
23927 if (EltVT == MVT::i8)
23930 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
23933 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23934 const X86Subtarget &Subtarget,
23935 SelectionDAG &DAG) {
23936 MVT VT = Op.getSimpleValueType();
23937 assert(VT.is128BitVector() &&
23938 "Only 128-bit vector bitmath lowering supported.");
23940 int VecSize = VT.getSizeInBits();
23941 MVT EltVT = VT.getVectorElementType();
23942 int Len = EltVT.getSizeInBits();
23944 // This is the vectorized version of the "best" algorithm from
23945 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23946 // with a minor tweak to use a series of adds + shifts instead of vector
23947 // multiplications. Implemented for all integer vector types. We only use
23948 // this when we don't have SSSE3 which allows a LUT-based lowering that is
23949 // much faster, even faster than using native popcnt instructions.
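// For example, starting from the byte v = 0x6C (0b01101100, 4 set bits):
//   v = v - ((v >> 1) & 0x55)          -> 0x58  (per-2-bit counts)
//   v = (v & 0x33) + ((v >> 2) & 0x33) -> 0x22  (per-nibble counts)
//   v = (v + (v >> 4)) & 0x0F          -> 0x04  (per-byte pop count)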
23951 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23952 MVT VT = V.getSimpleValueType();
23953 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23954 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23956 auto GetMask = [&](SDValue V, APInt Mask) {
23957 MVT VT = V.getSimpleValueType();
23958 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23959 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23962 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23963 // x86, so set the SRL type to have elements at least i16 wide. This is
23964 // correct because all of our SRLs are followed immediately by a mask anyways
23965 // that handles any bits that sneak into the high bits of the byte elements.
23966 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23970 // v = v - ((v >> 1) & 0x55555555...)
23972 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23973 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23974 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23976 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23977 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23978 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23979 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23980 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23982 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23983 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23984 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23985 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23987 // At this point, V contains the byte-wise population count, and we are
23988 // merely doing a horizontal sum if necessary to get the wider element
23989 // counts.
23990 if (EltVT == MVT::i8)
23993 return LowerHorizontalByteSum(
23994 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23998 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23999 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
24000 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24001 SelectionDAG &DAG) {
24002 MVT VT = Op.getSimpleValueType();
24003 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
24004 "Unknown CTPOP type to handle");
24005 SDLoc DL(Op.getNode());
24006 SDValue Op0 = Op.getOperand(0);
24008 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
24009 if (Subtarget.hasVPOPCNTDQ()) {
24010 unsigned NumElems = VT.getVectorNumElements();
24011 assert((VT.getVectorElementType() == MVT::i8 ||
24012 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
24013 if (NumElems <= 16) {
24014 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
24015 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
24016 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
24017 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
24021 if (!Subtarget.hasSSSE3()) {
24022 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
24023 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
24024 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
24027 // Decompose 256-bit ops into smaller 128-bit ops.
24028 if (VT.is256BitVector() && !Subtarget.hasInt256())
24029 return Lower256IntUnary(Op, DAG);
24031 // Decompose 512-bit ops into smaller 256-bit ops.
24032 if (VT.is512BitVector() && !Subtarget.hasBWI())
24033 return Lower512IntUnary(Op, DAG);
24035 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
24038 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24039 SelectionDAG &DAG) {
24040 assert(Op.getSimpleValueType().isVector() &&
24041 "We only do custom lowering for vector population count.");
24042 return LowerVectorCTPOP(Op, Subtarget, DAG);
24045 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
24046 MVT VT = Op.getSimpleValueType();
24047 SDValue In = Op.getOperand(0);
24050 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
24051 // perform the BITREVERSE.
24052 if (!VT.isVector()) {
24053 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
24054 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
24055 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
24056 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
24057 DAG.getIntPtrConstant(0, DL));
24060 int NumElts = VT.getVectorNumElements();
24061 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
24063 // Decompose 256-bit ops into smaller 128-bit ops.
24064 if (VT.is256BitVector())
24065 return Lower256IntUnary(Op, DAG);
24067 assert(VT.is128BitVector() &&
24068 "Only 128-bit vector bitreverse lowering supported.");
24070 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
24071 // perform the BSWAP in the shuffle.
24072 // It's best to shuffle using the second operand as this will implicitly allow
24073 // memory folding for multiple vectors.
24074 SmallVector<SDValue, 16> MaskElts;
24075 for (int i = 0; i != NumElts; ++i) {
24076 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
24077 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
24078 int PermuteByte = SourceByte | (2 << 5);
24079 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
24083 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
24084 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
24085 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
24087 return DAG.getBitcast(VT, Res);
24090 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
24091 SelectionDAG &DAG) {
24092 MVT VT = Op.getSimpleValueType();
24094 if (Subtarget.hasXOP() && !VT.is512BitVector())
24095 return LowerBITREVERSE_XOP(Op, DAG);
24097 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
24099 SDValue In = Op.getOperand(0);
24102 unsigned NumElts = VT.getVectorNumElements();
24103 assert(VT.getScalarType() == MVT::i8 &&
24104 "Only byte vector BITREVERSE supported");
24106 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
24107 if (VT.is256BitVector() && !Subtarget.hasInt256())
24108 return Lower256IntUnary(Op, DAG);
24110 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
24111 // two nibbles and a PSHUFB lookup to find the bitreverse of each
24112 // 0-15 value (moved to the other nibble).
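// For example, reversing the byte 0x2D (0b00101101): LoLUT[0xD] = 0xB0,
// HiLUT[0x2] = 0x04, and 0xB0 | 0x04 = 0xB4 == 0b10110100.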
24113 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
24114 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
24115 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
24117 const int LoLUT[16] = {
24118 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
24119 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
24120 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
24121 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
24122 const int HiLUT[16] = {
24123 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
24124 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
24125 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
24126 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
24128 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
24129 for (unsigned i = 0; i < NumElts; ++i) {
24130 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
24131 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
24134 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
24135 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
24136 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
24137 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
24138 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
24141 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
24142 const X86Subtarget &Subtarget,
24143 bool AllowIncDec = true) {
24144 unsigned NewOpc = 0;
24145 switch (N->getOpcode()) {
24146 case ISD::ATOMIC_LOAD_ADD:
24147 NewOpc = X86ISD::LADD;
24149 case ISD::ATOMIC_LOAD_SUB:
24150 NewOpc = X86ISD::LSUB;
24152 case ISD::ATOMIC_LOAD_OR:
24153 NewOpc = X86ISD::LOR;
24155 case ISD::ATOMIC_LOAD_XOR:
24156 NewOpc = X86ISD::LXOR;
24158 case ISD::ATOMIC_LOAD_AND:
24159 NewOpc = X86ISD::LAND;
24162 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
24165 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
24167 if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
24168 // Convert to inc/dec if they aren't slow or we are optimizing for size.
24169 if (AllowIncDec && (!Subtarget.slowIncDec() ||
24170 DAG.getMachineFunction().getFunction().optForSize())) {
24171 if ((NewOpc == X86ISD::LADD && C->isOne()) ||
24172 (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
24173 return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
24174 DAG.getVTList(MVT::i32, MVT::Other),
24175 {N->getOperand(0), N->getOperand(1)},
24176 /*MemVT=*/N->getSimpleValueType(0), MMO);
24177 if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
24178 (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
24179 return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
24180 DAG.getVTList(MVT::i32, MVT::Other),
24181 {N->getOperand(0), N->getOperand(1)},
24182 /*MemVT=*/N->getSimpleValueType(0), MMO);
24186 return DAG.getMemIntrinsicNode(
24187 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
24188 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
24189 /*MemVT=*/N->getSimpleValueType(0), MMO);
24192 /// Lower atomic_load_ops into LOCK-prefixed operations.
24193 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
24194 const X86Subtarget &Subtarget) {
24195 SDValue Chain = N->getOperand(0);
24196 SDValue LHS = N->getOperand(1);
24197 SDValue RHS = N->getOperand(2);
24198 unsigned Opc = N->getOpcode();
24199 MVT VT = N->getSimpleValueType(0);
24202 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
24203 // can only be lowered when the result is unused. They should have already
24204 // been transformed into a cmpxchg loop in AtomicExpand.
24205 if (N->hasAnyUseOfValue(0)) {
24206 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
24207 // select LXADD if LOCK_SUB can't be selected.
24208 if (Opc == ISD::ATOMIC_LOAD_SUB) {
24209 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
24210 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
24211 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
24212 RHS, AN->getMemOperand());
24214 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
24215 "Used AtomicRMW ops other than Add should have been expanded!");
24219 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
24220 // RAUW the chain, but don't worry about the result, as it's unused.
24221 assert(!N->hasAnyUseOfValue(0));
24222 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
24226 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
24227 SDNode *Node = Op.getNode();
24229 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
24231 // Convert seq_cst store -> xchg
24232 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
24233 // FIXME: On 32-bit, store -> fist or movq would be more efficient
24234 // (The only way to get a 16-byte store is cmpxchg16b)
24235 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
24236 if (cast<AtomicSDNode>(Node)->getOrdering() ==
24237 AtomicOrdering::SequentiallyConsistent ||
24238 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
24239 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
24240 cast<AtomicSDNode>(Node)->getMemoryVT(),
24241 Node->getOperand(0),
24242 Node->getOperand(1), Node->getOperand(2),
24243 cast<AtomicSDNode>(Node)->getMemOperand());
24244 return Swap.getValue(1);
24246 // Other atomic stores have a simple pattern.
24250 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
24251 SDNode *N = Op.getNode();
24252 MVT VT = N->getSimpleValueType(0);
24254 // Let legalize expand this if it isn't a legal type yet.
24255 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
24258 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
24261 // Set the carry flag.
24262 SDValue Carry = Op.getOperand(2);
24263 EVT CarryVT = Carry.getValueType();
24264 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
24265 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24266 Carry, DAG.getConstant(NegOne, DL, CarryVT));
24268 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
24269 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
24270 Op.getOperand(1), Carry.getValue(1));
24272 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
24273 if (N->getValueType(1) == MVT::i1)
24274 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
24276 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24279 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
24280 SelectionDAG &DAG) {
24281 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
24283 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
24284 // which returns the values as { float, float } (in XMM0) or
24285 // { double, double } (which is returned in XMM0, XMM1).
24287 SDValue Arg = Op.getOperand(0);
24288 EVT ArgVT = Arg.getValueType();
24289 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24291 TargetLowering::ArgListTy Args;
24292 TargetLowering::ArgListEntry Entry;
24296 Entry.IsSExt = false;
24297 Entry.IsZExt = false;
24298 Args.push_back(Entry);
24300 bool isF64 = ArgVT == MVT::f64;
24301 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
24302 // the small struct {f32, f32} is returned in (eax, edx). For f64,
24303 // the results are returned via SRet in memory.
24304 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24305 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
24306 const char *LibcallName = TLI.getLibcallName(LC);
24308 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
24310 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
24311 : (Type *)VectorType::get(ArgTy, 4);
24313 TargetLowering::CallLoweringInfo CLI(DAG);
24314 CLI.setDebugLoc(dl)
24315 .setChain(DAG.getEntryNode())
24316 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
24318 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
24321 // Returned in xmm0 and xmm1.
24322 return CallResult.first;
24324 // Returned in bits 0:31 and 32:63 of xmm0.
24325 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24326 CallResult.first, DAG.getIntPtrConstant(0, dl));
24327 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24328 CallResult.first, DAG.getIntPtrConstant(1, dl));
24329 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
24330 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
24333 /// Widen a vector input to a vector of NVT. The
24334 /// input vector must have the same element type as NVT.
24335 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
24336 bool FillWithZeroes = false) {
24337 // Check if InOp already has the right width.
24338 MVT InVT = InOp.getSimpleValueType();
24342 if (InOp.isUndef())
24343 return DAG.getUNDEF(NVT);
24345 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
24346 "input and widen element type must match");
24348 unsigned InNumElts = InVT.getVectorNumElements();
24349 unsigned WidenNumElts = NVT.getVectorNumElements();
24350 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
24351 "Unexpected request for vector widening");
24354 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
24355 InOp.getNumOperands() == 2) {
24356 SDValue N1 = InOp.getOperand(1);
24357 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
24359 InOp = InOp.getOperand(0);
24360 InVT = InOp.getSimpleValueType();
24361 InNumElts = InVT.getVectorNumElements();
24364 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
24365 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
24366 SmallVector<SDValue, 16> Ops;
24367 for (unsigned i = 0; i < InNumElts; ++i)
24368 Ops.push_back(InOp.getOperand(i));
24370 EVT EltVT = InOp.getOperand(0).getValueType();
24372 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
24373 DAG.getUNDEF(EltVT);
24374 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
24375 Ops.push_back(FillVal);
24376 return DAG.getBuildVector(NVT, dl, Ops);
24378 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
24380 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
24381 InOp, DAG.getIntPtrConstant(0, dl));
24384 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
24385 SelectionDAG &DAG) {
24386 assert(Subtarget.hasAVX512() &&
24387 "MGATHER/MSCATTER are supported on AVX-512 arch only");
24389 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
24390 SDValue Src = N->getValue();
24391 MVT VT = Src.getSimpleValueType();
24392 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
24395 SDValue Index = N->getIndex();
24396 SDValue Mask = N->getMask();
24397 SDValue Chain = N->getChain();
24398 SDValue BasePtr = N->getBasePtr();
24399 MVT MemVT = N->getMemoryVT().getSimpleVT();
24400 MVT IndexVT = Index.getSimpleValueType();
24401 MVT MaskVT = Mask.getSimpleValueType();
24403 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
24404 // The v2i32 value was promoted to v2i64.
24405 // Now we "redo" the type legalizer's work and widen the original
24406 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
24408 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
24409 "Unexpected memory type");
24410 int ShuffleMask[] = {0, 2, -1, -1};
24411 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
24412 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
24413 // Now we have 4 elements instead of 2.
24414 // Expand the index.
24415 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
24416 Index = ExtendToType(Index, NewIndexVT, DAG);
24418 // Expand the mask with zeroes
24419 // Mask may be <2 x i64> or <2 x i1> at this moment
24420 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
24421 "Unexpected mask type");
24422 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
24423 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
24427 unsigned NumElts = VT.getVectorNumElements();
24428 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
24429 !Index.getSimpleValueType().is512BitVector()) {
24430     // AVX512F supports only 512-bit vectors. Either the data or the index
24431     // needs to be 512 bits wide. If both the index and data are currently
24432     // 256-bit but the vector contains 8 elements, just sign-extend the index.
24433 if (IndexVT == MVT::v8i32)
24434 // Just extend index
24435 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24437 // The minimal number of elts in scatter is 8
24440 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
24441 // Use original index here, do not modify the index twice
24442 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
24443 if (IndexVT.getScalarType() == MVT::i32)
24444 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24447   // At this point we have a promoted mask operand.
24448 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
24449 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
24450 // Use the original mask here, do not modify the mask twice
24451 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
24453 // The value that should be stored
24454 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
24455 Src = ExtendToType(Src, NewVT, DAG);
24458   // If the mask is "wide" at this point, truncate it to an i1 vector.
24459 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
24460 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
24462 // The mask is killed by scatter, add it to the values
24463 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
24464 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
24465 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24466 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
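  // The new scatter node returns the updated mask in result 0 and the chain in
  // result 1; the original scatter only produced a chain, so rewire its uses
  // to the new chain.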
24467 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24468 return SDValue(NewScatter.getNode(), 1);
24471 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
24472 SelectionDAG &DAG) {
24474 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
24475 MVT VT = Op.getSimpleValueType();
24476 MVT ScalarVT = VT.getScalarType();
24477 SDValue Mask = N->getMask();
24480 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
24481 "Expanding masked load is supported on AVX-512 target only!");
24483 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
24484 "Expanding masked load is supported for 32 and 64-bit types only!");
24486   // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
24487   // VLX. Expanding loads of these types are handled here.
24488 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
24491 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24492 "Cannot lower masked load op.");
24494 assert((ScalarVT.getSizeInBits() >= 32 ||
24495 (Subtarget.hasBWI() &&
24496 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24497 "Unsupported masked load op.");
24499 // This operation is legal for targets with VLX, but without
24500   // VLX the vector should be widened to 512 bits.
24501 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
24502 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
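  // Widen the pass-through value and the mask to the 512-bit type, perform the
  // masked load there, and extract the original-width result afterwards.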
24503 SDValue Src0 = N->getSrc0();
24504 Src0 = ExtendToType(Src0, WideDataVT, DAG);
24506 // Mask element has to be i1.
24507 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
24508 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
24509 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
24511 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
24513 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24514 if (MaskEltTy != MVT::i1)
24515 Mask = DAG.getNode(ISD::TRUNCATE, dl,
24516 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
24517 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
24518 N->getBasePtr(), Mask, Src0,
24519 N->getMemoryVT(), N->getMemOperand(),
24520 N->getExtensionType(),
24521 N->isExpandingLoad());
24523   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
24524                                 NewLoad.getValue(0),
24525                                 DAG.getIntPtrConstant(0, dl));
24526   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
24527 return DAG.getMergeValues(RetOps, dl);
24530 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
24531 SelectionDAG &DAG) {
24532 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
24533 SDValue DataToStore = N->getValue();
24534 MVT VT = DataToStore.getSimpleValueType();
24535 MVT ScalarVT = VT.getScalarType();
24536 SDValue Mask = N->getMask();
24539   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
24540          "Compressing masked store is supported on AVX-512 target only!");
24542   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
24543          "Compressing masked store is supported for 32 and 64-bit types only!");
24545   // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
24546 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
24549 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24550 "Cannot lower masked store op.");
24552 assert((ScalarVT.getSizeInBits() >= 32 ||
24553 (Subtarget.hasBWI() &&
24554 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24555 "Unsupported masked store op.");
24557 // This operation is legal for targets with VLX, but without
24558   // VLX the vector should be widened to 512 bits.
24559 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
24560 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
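  // Widen the stored data and the mask to the 512-bit type; the mask is padded
  // with zeroes so the extra lanes are never stored.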
24562 // Mask element has to be i1.
24563 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
24564 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
24565 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
24567 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
24569 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
24570 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24571 if (MaskEltTy != MVT::i1)
24572 Mask = DAG.getNode(ISD::TRUNCATE, dl,
24573 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
24574 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
24575 Mask, N->getMemoryVT(), N->getMemOperand(),
24576 N->isTruncatingStore(), N->isCompressingStore());
24579 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
24580 SelectionDAG &DAG) {
24581   assert(Subtarget.hasAVX2() &&
24582          "MGATHER/MSCATTER are supported on AVX-512/AVX2 arch only");
24584 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
24586 MVT VT = Op.getSimpleValueType();
24587 SDValue Index = N->getIndex();
24588 SDValue Mask = N->getMask();
24589 SDValue Src0 = N->getValue();
24590 MVT IndexVT = Index.getSimpleValueType();
24591 MVT MaskVT = Mask.getSimpleValueType();
24593 unsigned NumElts = VT.getVectorNumElements();
24594 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
24596 // If the index is v2i32, we're being called by type legalization.
24597 if (IndexVT == MVT::v2i32)
24600 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24601 !Index.getSimpleValueType().is512BitVector()) {
24602     // AVX512F supports only 512-bit vectors. Either the data or the index
24603     // needs to be 512 bits wide. If both the index and data are currently
24604     // 256-bit but the vector contains 8 elements, just sign-extend the index.
24605 if (NumElts == 8) {
24606 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24607 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24608 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24609 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24610 N->getMemOperand());
24611 return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
24614 // Minimal number of elements in Gather
24617 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
24618 Index = ExtendToType(Index, NewIndexVT, DAG);
24619 if (IndexVT.getScalarType() == MVT::i32)
24620 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24623 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
24624   // At this point we have a promoted mask operand.
24625 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
24626 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
24627 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
24628 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
24630 // The pass-through value
24631 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
24632 Src0 = ExtendToType(Src0, NewVT, DAG);
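  // Emit the gather on the widened types; result 0 is the data, result 1 the
  // mask and result 2 the chain. Extract the original-width data afterwards.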
24634 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24635 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24636 DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24637 N->getMemOperand());
24638 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
24639 NewGather.getValue(0),
24640 DAG.getIntPtrConstant(0, dl));
24641 SDValue RetOps[] = {Extract, NewGather.getValue(2)};
24642 return DAG.getMergeValues(RetOps, dl);
24645 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24646 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24647 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24648 N->getMemOperand());
24649 return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
24652 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
24653 SelectionDAG &DAG) const {
24654 // TODO: Eventually, the lowering of these nodes should be informed by or
24655 // deferred to the GC strategy for the function in which they appear. For
24656 // now, however, they must be lowered to something. Since they are logically
24657 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24658   // require special handling for these nodes), lower them as literal NOOPs for the time being.
24660 SmallVector<SDValue, 2> Ops;
24662 Ops.push_back(Op.getOperand(0));
24663 if (Op->getGluedNode())
24664 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24667 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24668 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24673 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
24674 SelectionDAG &DAG) const {
24675 // TODO: Eventually, the lowering of these nodes should be informed by or
24676 // deferred to the GC strategy for the function in which they appear. For
24677 // now, however, they must be lowered to something. Since they are logically
24678 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24679   // require special handling for these nodes), lower them as literal NOOPs for the time being.
24681 SmallVector<SDValue, 2> Ops;
24683 Ops.push_back(Op.getOperand(0));
24684 if (Op->getGluedNode())
24685 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24688 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24689 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24694 /// Provide custom lowering hooks for some operations.
24695 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24696 switch (Op.getOpcode()) {
24697 default: llvm_unreachable("Should not custom lower this!");
24698 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24699 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24700 return LowerCMP_SWAP(Op, Subtarget, DAG);
24701 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
24702 case ISD::ATOMIC_LOAD_ADD:
24703 case ISD::ATOMIC_LOAD_SUB:
24704 case ISD::ATOMIC_LOAD_OR:
24705 case ISD::ATOMIC_LOAD_XOR:
24706 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
24707 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
24708 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
24709 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
24710 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24711 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
24712 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
24713 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24714 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
24715 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24716 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
24717 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24718 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
24719 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
24720 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
24721 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
24722 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
24723 case ISD::SHL_PARTS:
24724 case ISD::SRA_PARTS:
24725 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
24726 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
24727 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
24728 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
24729 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
24730 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24731 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
24732 case ISD::ZERO_EXTEND_VECTOR_INREG:
24733 case ISD::SIGN_EXTEND_VECTOR_INREG:
24734 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24735 case ISD::FP_TO_SINT:
24736 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
24737 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
24738 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
24740 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
24741 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
24742 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
24743 case ISD::SETCC: return LowerSETCC(Op, DAG);
24744 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
24745 case ISD::SELECT: return LowerSELECT(Op, DAG);
24746 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
24747 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
24748 case ISD::VASTART: return LowerVASTART(Op, DAG);
24749 case ISD::VAARG: return LowerVAARG(Op, DAG);
24750 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
24751 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
24752 case ISD::INTRINSIC_VOID:
24753 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24754 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
24755 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
24756 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
24757 case ISD::FRAME_TO_ARGS_OFFSET:
24758 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24759 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24760 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
24761 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
24762 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
24763 case ISD::EH_SJLJ_SETUP_DISPATCH:
24764 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24765 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
24766 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
24767 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
24769 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
24771 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
24772 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
24774 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
24775 case ISD::UMUL_LOHI:
24776 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
24778 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
24781 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
24787 case ISD::UMULO: return LowerXALUO(Op, DAG);
24788 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24789 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
24790 case ISD::ADDCARRY:
24791 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
24793 case ISD::SUB: return LowerADD_SUB(Op, DAG);
24797 case ISD::UMIN: return LowerMINMAX(Op, DAG);
24798 case ISD::ABS: return LowerABS(Op, DAG);
24799 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
24800 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
24801 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
24802 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
24803 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
24804 case ISD::GC_TRANSITION_START:
24805 return LowerGC_TRANSITION_START(Op, DAG);
24806 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
24807 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
24811 /// Places new result values for the node in Results (their number
24812 /// and types must exactly match those of the original return values of
24813 /// the node), or leaves Results empty, which indicates that the node is not
24814 /// to be custom lowered after all.
24815 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24816 SmallVectorImpl<SDValue> &Results,
24817 SelectionDAG &DAG) const {
24818 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
24820 if (!Res.getNode())
24823 assert((N->getNumValues() <= Res->getNumValues()) &&
24824 "Lowering returned the wrong number of results!");
24826   // Place the new result values based on the result number of N.
24827   // In some cases (LowerSINT_TO_FP, for example) Res has more result values
24828   // than the original node; the chain (the last value) should be dropped.
24829 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24830 Results.push_back(Res.getValue(I));
24833 /// Replace a node with an illegal result type with a new node built out of custom code.
24835 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24836 SmallVectorImpl<SDValue>&Results,
24837 SelectionDAG &DAG) const {
24839 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24840 switch (N->getOpcode()) {
24842 llvm_unreachable("Do not know how to custom type legalize this operation!");
24843 case X86ISD::AVG: {
24844 // Legalize types for X86ISD::AVG by expanding vectors.
24845 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24847 auto InVT = N->getValueType(0);
24848 auto InVTSize = InVT.getSizeInBits();
24849 const unsigned RegSize =
24850 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24851 assert((Subtarget.hasBWI() || RegSize < 512) &&
24852 "512-bit vector requires AVX512BW");
24853 assert((Subtarget.hasAVX2() || RegSize < 256) &&
24854 "256-bit vector requires AVX2");
24856 auto ElemVT = InVT.getVectorElementType();
24857 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24858 RegSize / ElemVT.getSizeInBits());
24859 assert(RegSize % InVT.getSizeInBits() == 0);
24860 unsigned NumConcat = RegSize / InVT.getSizeInBits();
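    // Concatenate each operand with undef up to the legal register width,
    // perform the AVG there, and (unless widening legalization is enabled)
    // extract the original narrow result.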
24862 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24863 Ops[0] = N->getOperand(0);
24864 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24865 Ops[0] = N->getOperand(1);
24866 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24868 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24869 if (!ExperimentalVectorWideningLegalization)
24870 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24871 DAG.getIntPtrConstant(0, dl));
24872 Results.push_back(Res);
24875 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24876 case X86ISD::FMINC:
24878 case X86ISD::FMAXC:
24879 case X86ISD::FMAX: {
24880 EVT VT = N->getValueType(0);
24881 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24882 SDValue UNDEF = DAG.getUNDEF(VT);
24883 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24884 N->getOperand(0), UNDEF);
24885 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24886 N->getOperand(1), UNDEF);
24887 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
24895 case ISD::UDIVREM: {
24896 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24897 Results.push_back(V);
24900 case ISD::FP_TO_SINT:
24901 case ISD::FP_TO_UINT: {
24902 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24904 if (N->getValueType(0) == MVT::v2i32) {
24905 assert((IsSigned || Subtarget.hasAVX512()) &&
24906 "Can only handle signed conversion without AVX512");
24907 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24908 SDValue Src = N->getOperand(0);
24909 if (Src.getValueType() == MVT::v2f64) {
24910 MVT ResVT = MVT::v4i32;
24911 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
24912 if (!IsSigned && !Subtarget.hasVLX()) {
24913 // Widen to 512-bits.
24914 ResVT = MVT::v8i32;
24915 Opc = ISD::FP_TO_UINT;
24916 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
24917 DAG.getUNDEF(MVT::v8f64),
24918 Src, DAG.getIntPtrConstant(0, dl));
24920 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
24921         ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32 : MVT::v2i32;
24923 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
24924 DAG.getIntPtrConstant(0, dl));
24925 Results.push_back(Res);
24928 if (Src.getValueType() == MVT::v2f32) {
24929 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24930 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24931 DAG.getUNDEF(MVT::v2f32));
24932 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24933 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24934 if (!ExperimentalVectorWideningLegalization)
24935 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24936 Results.push_back(Res);
24940 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24941 // so early out here.
24945 std::pair<SDValue,SDValue> Vals =
24946 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24947 SDValue FIST = Vals.first, StackSlot = Vals.second;
24948 if (FIST.getNode()) {
24949 EVT VT = N->getValueType(0);
24950 // Return a load from the stack slot.
24951 if (StackSlot.getNode())
24953 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24955 Results.push_back(FIST);
24959 case ISD::SINT_TO_FP: {
24960 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24961 SDValue Src = N->getOperand(0);
24962 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24964 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24967 case ISD::UINT_TO_FP: {
24968 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24969 EVT VT = N->getValueType(0);
24970 if (VT != MVT::v2f32)
24972 SDValue Src = N->getOperand(0);
24973 EVT SrcVT = Src.getValueType();
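    // The v2i32 path below uses the standard 2^52 bias trick: zero-extend to
    // i64, OR in the bit pattern of 2^52 so the integer occupies the double's
    // mantissa, subtract the bias, then round the v2f64 result down to v4f32.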
24974 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24975 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24978 if (SrcVT != MVT::v2i32)
24980 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24982     SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24983 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24984 DAG.getBitcast(MVT::v2i64, VBias));
24985 Or = DAG.getBitcast(MVT::v2f64, Or);
24986 // TODO: Are there any fast-math-flags to propagate here?
24987 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24988 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24991 case ISD::FP_ROUND: {
24992 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24994 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24995 Results.push_back(V);
24998 case ISD::FP_EXTEND: {
24999 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
25000 // No other ValueType for FP_EXTEND should reach this point.
25001 assert(N->getValueType(0) == MVT::v2f32 &&
25002 "Do not know how to legalize this Node");
25005 case ISD::INTRINSIC_W_CHAIN: {
25006 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
25008 default : llvm_unreachable("Do not know how to custom type "
25009 "legalize this intrinsic operation!");
25010 case Intrinsic::x86_rdtsc:
25011 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25013 case Intrinsic::x86_rdtscp:
25014 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
25016 case Intrinsic::x86_rdpmc:
25017 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
25019 case Intrinsic::x86_xgetbv:
25020 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
25023 case ISD::INTRINSIC_WO_CHAIN: {
25024 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
25025 Results.push_back(V);
25028 case ISD::READCYCLECOUNTER: {
25029 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25032 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
25033 EVT T = N->getValueType(0);
25034 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
25035 bool Regs64bit = T == MVT::i128;
25036 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
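    // CMPXCHG8B/CMPXCHG16B expect the expected value in EDX:EAX (RDX:RAX) and
    // the replacement value in ECX:EBX (RCX:RBX); success is reported in ZF.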
25037 SDValue cpInL, cpInH;
25038 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25039 DAG.getConstant(0, dl, HalfT));
25040 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25041 DAG.getConstant(1, dl, HalfT));
25042 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
25043 Regs64bit ? X86::RAX : X86::EAX,
25045 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
25046 Regs64bit ? X86::RDX : X86::EDX,
25047 cpInH, cpInL.getValue(1));
25048 SDValue swapInL, swapInH;
25049 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25050 DAG.getConstant(0, dl, HalfT));
25051 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25052 DAG.getConstant(1, dl, HalfT));
25054 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
25055 swapInH, cpInH.getValue(1));
25056     // If the current function needs the base pointer, RBX,
25057     // we shouldn't use cmpxchg directly.
25058     // The lowering of that instruction will clobber that register, and
25059     // since RBX will then be a reserved register the register allocator
25060     // will not make sure its value is properly saved and restored
25061     // around this live range.
25062 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25064 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
25065 unsigned BasePtr = TRI->getBaseRegister();
25066 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
25067 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
25068 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
25069       // ISel prefers the LCMPXCHG64 variant.
25070       // If that assert breaks, it means that is no longer the case,
25071       // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
25072       // not just EBX. This is a matter of accepting i64 input for that
25073       // pseudo, and restoring into the register of the right width
25074       // when expanding the pseudo. Everything else should just work.
25075 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
25076 "Saving only half of the RBX");
25077 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
25078 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
25079 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
25080 Regs64bit ? X86::RBX : X86::EBX,
25081 HalfT, swapInH.getValue(1));
25082 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
25084 /*Glue*/ RBXSave.getValue(2)};
25085 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25088 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
25089 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
25090 Regs64bit ? X86::RBX : X86::EBX, swapInL,
25091 swapInH.getValue(1));
25092 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
25093 swapInL.getValue(1)};
25094 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25096 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
25097 Regs64bit ? X86::RAX : X86::EAX,
25098 HalfT, Result.getValue(1));
25099 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
25100 Regs64bit ? X86::RDX : X86::EDX,
25101 HalfT, cpOutL.getValue(2));
25102 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
25104 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
25105 MVT::i32, cpOutH.getValue(2));
25106 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
25107 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
25109 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
25110 Results.push_back(Success);
25111 Results.push_back(EFLAGS.getValue(1));
25114 case ISD::ATOMIC_SWAP:
25115 case ISD::ATOMIC_LOAD_ADD:
25116 case ISD::ATOMIC_LOAD_SUB:
25117 case ISD::ATOMIC_LOAD_AND:
25118 case ISD::ATOMIC_LOAD_OR:
25119 case ISD::ATOMIC_LOAD_XOR:
25120 case ISD::ATOMIC_LOAD_NAND:
25121 case ISD::ATOMIC_LOAD_MIN:
25122 case ISD::ATOMIC_LOAD_MAX:
25123 case ISD::ATOMIC_LOAD_UMIN:
25124 case ISD::ATOMIC_LOAD_UMAX:
25125 case ISD::ATOMIC_LOAD: {
25126 // Delegate to generic TypeLegalization. Situations we can really handle
25127 // should have already been dealt with by AtomicExpandPass.cpp.
25130 case ISD::BITCAST: {
25131 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25132 EVT DstVT = N->getValueType(0);
25133 EVT SrcVT = N->getOperand(0).getValueType();
25135 if (SrcVT != MVT::f64 ||
25136 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
25139 unsigned NumElts = DstVT.getVectorNumElements();
25140 EVT SVT = DstVT.getVectorElementType();
25141 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
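    // Place the f64 into the low lane of a v2f64, bitcast that to the integer
    // vector type with twice as many elements, and then either return it
    // directly (widening legalization) or extract the low NumElts elements.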
25142 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
25143 MVT::v2f64, N->getOperand(0));
25144 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
25146 if (ExperimentalVectorWideningLegalization) {
25147 // If we are legalizing vectors by widening, we already have the desired
25148 // legal vector type, just return it.
25149 Results.push_back(ToVecInt);
25153 SmallVector<SDValue, 8> Elts;
25154 for (unsigned i = 0, e = NumElts; i != e; ++i)
25155 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
25156 ToVecInt, DAG.getIntPtrConstant(i, dl)));
25158 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
25161 case ISD::MGATHER: {
25162 EVT VT = N->getValueType(0);
25163 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25164 auto *Gather = cast<MaskedGatherSDNode>(N);
25165 SDValue Index = Gather->getIndex();
25166 if (Index.getValueType() != MVT::v2i64)
25168 SDValue Mask = Gather->getMask();
25169 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25170 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25171 Gather->getValue(),
25172 DAG.getUNDEF(MVT::v2f32));
25173 if (!Subtarget.hasVLX()) {
25174 // We need to widen the mask, but the instruction will only use 2
25175 // of its elements. So we can use undef.
25176 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25177 DAG.getUNDEF(MVT::v2i1));
25178 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25180 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25182 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25183 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
25184 Gather->getMemoryVT(), Gather->getMemOperand());
25185 Results.push_back(Res);
25186 Results.push_back(Res.getValue(2));
25189 if (VT == MVT::v2i32) {
25190 auto *Gather = cast<MaskedGatherSDNode>(N);
25191 SDValue Index = Gather->getIndex();
25192 SDValue Mask = Gather->getMask();
25193 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25194 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
25195 Gather->getValue(),
25196 DAG.getUNDEF(MVT::v2i32));
25197 // If the index is v2i64 we can use it directly.
25198 if (Index.getValueType() == MVT::v2i64 &&
25199 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25200 if (!Subtarget.hasVLX()) {
25201 // We need to widen the mask, but the instruction will only use 2
25202 // of its elements. So we can use undef.
25203 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25204 DAG.getUNDEF(MVT::v2i1));
25205 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25207 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25209 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25210 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25211 Gather->getMemoryVT(), Gather->getMemOperand());
25212 SDValue Chain = Res.getValue(2);
25213 if (!ExperimentalVectorWideningLegalization)
25214 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25215 DAG.getIntPtrConstant(0, dl));
25216 Results.push_back(Res);
25217 Results.push_back(Chain);
25220 EVT IndexVT = Index.getValueType();
25221 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25222 IndexVT.getScalarType(), 4);
25223 // Otherwise we need to custom widen everything to avoid promotion.
25224 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25225 DAG.getUNDEF(IndexVT));
25226 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25227 DAG.getConstant(0, dl, MVT::v2i1));
25228 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25230 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25231 Gather->getMemoryVT(), dl, Ops,
25232 Gather->getMemOperand());
25233 SDValue Chain = Res.getValue(1);
25234 if (!ExperimentalVectorWideningLegalization)
25235 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25236 DAG.getIntPtrConstant(0, dl));
25237 Results.push_back(Res);
25238 Results.push_back(Chain);
25246 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25247 switch ((X86ISD::NodeType)Opcode) {
25248 case X86ISD::FIRST_NUMBER: break;
25249 case X86ISD::BSF: return "X86ISD::BSF";
25250 case X86ISD::BSR: return "X86ISD::BSR";
25251 case X86ISD::SHLD: return "X86ISD::SHLD";
25252 case X86ISD::SHRD: return "X86ISD::SHRD";
25253 case X86ISD::FAND: return "X86ISD::FAND";
25254 case X86ISD::FANDN: return "X86ISD::FANDN";
25255 case X86ISD::FOR: return "X86ISD::FOR";
25256 case X86ISD::FXOR: return "X86ISD::FXOR";
25257 case X86ISD::FILD: return "X86ISD::FILD";
25258 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25259 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25260 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25261 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25262 case X86ISD::FLD: return "X86ISD::FLD";
25263 case X86ISD::FST: return "X86ISD::FST";
25264 case X86ISD::CALL: return "X86ISD::CALL";
25265 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25266 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25267 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25268 case X86ISD::BT: return "X86ISD::BT";
25269 case X86ISD::CMP: return "X86ISD::CMP";
25270 case X86ISD::COMI: return "X86ISD::COMI";
25271 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25272 case X86ISD::CMPM: return "X86ISD::CMPM";
25273 case X86ISD::CMPMU: return "X86ISD::CMPMU";
25274 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25275 case X86ISD::SETCC: return "X86ISD::SETCC";
25276 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25277 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25278 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25279 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25280 case X86ISD::CMOV: return "X86ISD::CMOV";
25281 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25282 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25283 case X86ISD::IRET: return "X86ISD::IRET";
25284 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25285 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25286 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25287 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25288 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25289 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25290 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25291 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25292 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25293 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25294 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25295 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25296 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25297 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25298 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25299 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25300 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25301 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25302 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25303 case X86ISD::HADD: return "X86ISD::HADD";
25304 case X86ISD::HSUB: return "X86ISD::HSUB";
25305 case X86ISD::FHADD: return "X86ISD::FHADD";
25306 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25307 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25308 case X86ISD::FMAX: return "X86ISD::FMAX";
25309 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25310 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25311   case X86ISD::FMAXS_RND:          return "X86ISD::FMAXS_RND";
25312 case X86ISD::FMIN: return "X86ISD::FMIN";
25313 case X86ISD::FMINS: return "X86ISD::FMINS";
25314 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25315 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25316 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25317 case X86ISD::FMINC: return "X86ISD::FMINC";
25318 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25319 case X86ISD::FRCP: return "X86ISD::FRCP";
25320 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25321 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25322 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25323 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25324 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25325 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25326 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25327 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25328 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25329 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25330 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25331 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25332 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25333 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25334 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25335 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25336 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25337 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25338 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25339 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25340 case X86ISD::LADD: return "X86ISD::LADD";
25341 case X86ISD::LSUB: return "X86ISD::LSUB";
25342 case X86ISD::LOR: return "X86ISD::LOR";
25343 case X86ISD::LXOR: return "X86ISD::LXOR";
25344 case X86ISD::LAND: return "X86ISD::LAND";
25345 case X86ISD::LINC: return "X86ISD::LINC";
25346 case X86ISD::LDEC: return "X86ISD::LDEC";
25347 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25348 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25349 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25350 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25351 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25352 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25353 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25354 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25355 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25356 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25357 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25358 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25359 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25360 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25361 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25362 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25363 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25364 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
25365 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25366 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25367 case X86ISD::VSHL: return "X86ISD::VSHL";
25368 case X86ISD::VSRL: return "X86ISD::VSRL";
25369 case X86ISD::VSRA: return "X86ISD::VSRA";
25370 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25371 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25372 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25373 case X86ISD::VSRAV: return "X86ISD::VSRAV";
25374 case X86ISD::VROTLI: return "X86ISD::VROTLI";
25375 case X86ISD::VROTRI: return "X86ISD::VROTRI";
25376 case X86ISD::VPPERM: return "X86ISD::VPPERM";
25377 case X86ISD::CMPP: return "X86ISD::CMPP";
25378 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
25379 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
25380 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
25381 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
25382 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
25383 case X86ISD::ADD: return "X86ISD::ADD";
25384 case X86ISD::SUB: return "X86ISD::SUB";
25385 case X86ISD::ADC: return "X86ISD::ADC";
25386 case X86ISD::SBB: return "X86ISD::SBB";
25387 case X86ISD::SMUL: return "X86ISD::SMUL";
25388 case X86ISD::UMUL: return "X86ISD::UMUL";
25389 case X86ISD::SMUL8: return "X86ISD::SMUL8";
25390 case X86ISD::UMUL8: return "X86ISD::UMUL8";
25391 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
25392 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
25393 case X86ISD::INC: return "X86ISD::INC";
25394 case X86ISD::DEC: return "X86ISD::DEC";
25395 case X86ISD::OR: return "X86ISD::OR";
25396 case X86ISD::XOR: return "X86ISD::XOR";
25397 case X86ISD::AND: return "X86ISD::AND";
25398 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
25399 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
25400 case X86ISD::PTEST: return "X86ISD::PTEST";
25401 case X86ISD::TESTP: return "X86ISD::TESTP";
25402 case X86ISD::TESTM: return "X86ISD::TESTM";
25403 case X86ISD::TESTNM: return "X86ISD::TESTNM";
25404 case X86ISD::KORTEST: return "X86ISD::KORTEST";
25405 case X86ISD::KTEST: return "X86ISD::KTEST";
25406 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
25407 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
25408 case X86ISD::PACKSS: return "X86ISD::PACKSS";
25409 case X86ISD::PACKUS: return "X86ISD::PACKUS";
25410 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
25411 case X86ISD::VALIGN: return "X86ISD::VALIGN";
25412 case X86ISD::VSHLD: return "X86ISD::VSHLD";
25413 case X86ISD::VSHRD: return "X86ISD::VSHRD";
25414 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
25415 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
25416 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
25417 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
25418 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
25419 case X86ISD::SHUFP: return "X86ISD::SHUFP";
25420 case X86ISD::SHUF128: return "X86ISD::SHUF128";
25421 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
25422 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
25423 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
25424 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
25425 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
25426 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
25427 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
25428 case X86ISD::MOVSD: return "X86ISD::MOVSD";
25429 case X86ISD::MOVSS: return "X86ISD::MOVSS";
25430 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
25431 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
25432 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
25433 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
25434 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
25435 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
25436 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
25437 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
25438 case X86ISD::VPERMV: return "X86ISD::VPERMV";
25439 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
25440 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
25441 case X86ISD::VPERMI: return "X86ISD::VPERMI";
25442 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
25443 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
25444 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
25445 case X86ISD::VRANGE: return "X86ISD::VRANGE";
25446 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
25447 case X86ISD::VRANGES: return "X86ISD::VRANGES";
25448 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
25449 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
25450 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
25451 case X86ISD::PSADBW: return "X86ISD::PSADBW";
25452 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
25453 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
25454 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
25455 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
25456 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
25457 case X86ISD::MFENCE: return "X86ISD::MFENCE";
25458 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
25459 case X86ISD::SAHF: return "X86ISD::SAHF";
25460 case X86ISD::RDRAND: return "X86ISD::RDRAND";
25461 case X86ISD::RDSEED: return "X86ISD::RDSEED";
25462 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
25463 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
25464 case X86ISD::VPSHA: return "X86ISD::VPSHA";
25465 case X86ISD::VPSHL: return "X86ISD::VPSHL";
25466 case X86ISD::VPCOM: return "X86ISD::VPCOM";
25467 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
25468 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
25469 case X86ISD::FMSUB: return "X86ISD::FMSUB";
25470 case X86ISD::FNMADD: return "X86ISD::FNMADD";
25471 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
25472 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
25473 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
25474 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
25475 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
25476 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
25477 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
25478 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
25479 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
25480 case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
25481 case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
25482 case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
25483 case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
25484 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
25485 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
25486 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
25487 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
25488 case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
25489 case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
25490 case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
25491 case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
25492 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
25493 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
25494 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
25495 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
25496 case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
25497 case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
25498 case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
25499 case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
25500 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
25501 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
25502 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
25503 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
25504 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
25505 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
25506 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
25507 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
25508 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
25509 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
25510 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
25511 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
25512 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
25513 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
25514 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
25515 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
25516 case X86ISD::XTEST: return "X86ISD::XTEST";
25517 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
25518 case X86ISD::EXPAND: return "X86ISD::EXPAND";
25519 case X86ISD::SELECT: return "X86ISD::SELECT";
25520 case X86ISD::SELECTS: return "X86ISD::SELECTS";
25521 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
25522 case X86ISD::RCP14: return "X86ISD::RCP14";
25523 case X86ISD::RCP14S: return "X86ISD::RCP14S";
25524 case X86ISD::RCP28: return "X86ISD::RCP28";
25525 case X86ISD::RCP28S: return "X86ISD::RCP28S";
25526 case X86ISD::EXP2: return "X86ISD::EXP2";
25527 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
25528 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
25529 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
25530 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
25531 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
25532 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
25533 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
25534 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
25535 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
25536 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
25537 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
25538 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
25539 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
25540 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
25541 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
25542 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
25543 case X86ISD::SCALEF: return "X86ISD::SCALEF";
25544 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
25545 case X86ISD::ADDS: return "X86ISD::ADDS";
25546 case X86ISD::SUBS: return "X86ISD::SUBS";
25547 case X86ISD::AVG: return "X86ISD::AVG";
25548 case X86ISD::MULHRS: return "X86ISD::MULHRS";
25549 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
25550 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
25551 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
25552 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
25553 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
25554 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
25555 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
25556 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
25557 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
25558 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
25559 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
25560 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
25561 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
25562 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
25563 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
25564 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
25565 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
25566 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
25567 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
25568 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
25569 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
25570 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
25571 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
25572 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
25573 case X86ISD::LWPINS: return "X86ISD::LWPINS";
25574 case X86ISD::MGATHER: return "X86ISD::MGATHER";
25575 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
25576 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
25577 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
25578 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
25579 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
25580 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
25581 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
25582 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
25583 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
25588 /// Return true if the addressing mode represented by AM is legal for this
25589 /// target, for a load/store of the specified type.
25590 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
25591 const AddrMode &AM, Type *Ty,
25593 Instruction *I) const {
25594 // X86 supports extremely general addressing modes.
25595 CodeModel::Model M = getTargetMachine().getCodeModel();
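  // The hardware encodes [base + scale*index + disp32] with scale 1, 2, 4 or 8;
  // scales 3, 5 and 9 can only be formed by reusing the index register as the
  // base, so they are accepted below only when no other base register is needed.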
25597 // X86 allows a sign-extended 32-bit immediate field as a displacement.
25598 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
25602 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
25604 // If a reference to this global requires an extra load, we can't fold it.
25605 if (isGlobalStubReference(GVFlags))
25608 // If BaseGV requires a register for the PIC base, we cannot also have a
25609 // BaseReg specified.
25610 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
25613 // If lower 4G is not available, then we must use rip-relative addressing.
25614 if ((M != CodeModel::Small || isPositionIndependent()) &&
25615 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
25619 switch (AM.Scale) {
25625 // These scales always work.
25630   // These scales are formed with basereg+scalereg. Only accept if there is no basereg yet.
25635 default: // Other stuff never works.
25642 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
25643 unsigned Bits = Ty->getScalarSizeInBits();
25645 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
25646 // particularly cheaper than those without.
25650 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
25651 // shifts just as cheap as scalar ones.
25652 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
25655 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
25656 // fully general vector.
25660 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
25661 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25663 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
25664 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
25665 return NumBits1 > NumBits2;
25668 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
25669 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25672 if (!isTypeLegal(EVT::getEVT(Ty1)))
25675 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
25677 // Assuming the caller doesn't have a zeroext or signext return parameter,
25678 // truncation all the way down to i1 is valid.
25682 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
25683 return isInt<32>(Imm);
25686 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
25687 // Can also use sub to handle negated immediates.
25688 return isInt<32>(Imm);
25691 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
25692 if (!VT1.isInteger() || !VT2.isInteger())
25694 unsigned NumBits1 = VT1.getSizeInBits();
25695 unsigned NumBits2 = VT2.getSizeInBits();
25696 return NumBits1 > NumBits2;
25699 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
25700 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
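// For example, "movl %ecx, %eax" already clears bits 63:32 of %rax, so no
// separate zero-extension instruction is needed.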
25701 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
25704 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
25705 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25706 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
25709 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
25710 EVT VT1 = Val.getValueType();
25711 if (isZExtFree(VT1, VT2))
25714 if (Val.getOpcode() != ISD::LOAD)
25717 if (!VT1.isSimple() || !VT1.isInteger() ||
25718 !VT2.isSimple() || !VT2.isInteger())
25721 switch (VT1.getSimpleVT().SimpleTy) {
25726 // X86 has 8, 16, and 32-bit zero-extending loads.
25733 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
25736 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
25737 if (!Subtarget.hasAnyFMA())
25740 VT = VT.getScalarType();
25742 if (!VT.isSimple())
25745 switch (VT.getSimpleVT().SimpleTy) {
25756 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
25757 // i16 instructions are longer (0x66 prefix) and potentially slower.
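// For example, "addw $1, %ax" needs a 0x66 operand-size prefix that
// "addl $1, %eax" does not, and partial-register writes of %ax can stall.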
25758 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
25761 /// Targets can use this to indicate that they only support *some*
25762 /// VECTOR_SHUFFLE operations, those with specific masks.
25763 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
25764 /// are assumed to be legal.
25765 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
25766 if (!VT.isSimple())
25769 // Not for i1 vectors
25770 if (VT.getSimpleVT().getScalarType() == MVT::i1)
25773 // Very little shuffling can be done for 64-bit vectors right now.
25774 if (VT.getSimpleVT().getSizeInBits() == 64)
25777 // We only care that the types being shuffled are legal. The lowering can
25778 // handle any possible shuffle mask that results.
25779 return isTypeLegal(VT.getSimpleVT());
25783 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
// Just delegate to the generic legality check; clear masks aren't special.
25786 return isShuffleMaskLegal(Mask, VT);
25789 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
// If the subtarget is using retpolines, we must not generate jump tables,
// since they are lowered through indirect branches.
25791 if (Subtarget.useRetpoline())
25794 // Otherwise, fallback on the generic logic.
25795 return TargetLowering::areJTsAllowed(Fn);
25798 //===----------------------------------------------------------------------===//
25799 // X86 Scheduler Hooks
25800 //===----------------------------------------------------------------------===//
25802 /// Utility function to emit xbegin specifying the start of an RTM region.
25803 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
25804 const TargetInstrInfo *TII) {
25805 DebugLoc DL = MI.getDebugLoc();
25807 const BasicBlock *BB = MBB->getBasicBlock();
25808 MachineFunction::iterator I = ++MBB->getIterator();
// For the v = xbegin(), we generate
//
// thisMBB:
//  xbegin fallMBB
//
// mainMBB:
//  s0 = -1
//
// fallMBB:
//  eax = # XABORT_DEF
//  s1 = eax
//
// sinkMBB:
//  v = phi(s0/mainMBB, s1/fallMBB)
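// XBEGIN starts a transactional region: if the transaction starts, execution
// falls through (and v gets -1, i.e. _XBEGIN_STARTED); if it aborts, control
// resumes at the fallback path with the abort status in EAX, which becomes v.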
25825 MachineBasicBlock *thisMBB = MBB;
25826 MachineFunction *MF = MBB->getParent();
25827 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25828 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25829 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25830 MF->insert(I, mainMBB);
25831 MF->insert(I, fallMBB);
25832 MF->insert(I, sinkMBB);
25834 // Transfer the remainder of BB and its successor edges to sinkMBB.
25835 sinkMBB->splice(sinkMBB->begin(), MBB,
25836 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25837 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25839 MachineRegisterInfo &MRI = MF->getRegInfo();
25840 unsigned DstReg = MI.getOperand(0).getReg();
25841 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25842 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25843 unsigned fallDstReg = MRI.createVirtualRegister(RC);
// thisMBB:
//  xbegin fallMBB
//  # fallthrough to mainMBB
//  # abort jumps to fallMBB
25849 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25850 thisMBB->addSuccessor(mainMBB);
25851 thisMBB->addSuccessor(fallMBB);
// mainMBB:
//  mainDstReg := -1
25855 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25856 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25857 mainMBB->addSuccessor(sinkMBB);
// fallMBB:
//  ; pseudo instruction to model hardware's definition from XABORT
25861 // EAX := XABORT_DEF
25862 // fallDstReg := EAX
25863 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25864 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25866 fallMBB->addSuccessor(sinkMBB);
// sinkMBB:
//  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
25870 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25871 .addReg(mainDstReg).addMBB(mainMBB)
25872 .addReg(fallDstReg).addMBB(fallMBB);
25874 MI.eraseFromParent();
// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8 or
// XMM0_V32I8 in AVX, all of this code can be replaced with that in the
// .td file.
25881 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25882 const TargetInstrInfo *TII) {
25884 switch (MI.getOpcode()) {
25885 default: llvm_unreachable("illegal opcode!");
25886 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
25887 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25888 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
25889 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25890 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
25891 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25892 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
25893 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25896 DebugLoc dl = MI.getDebugLoc();
25897 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25899 unsigned NumArgs = MI.getNumOperands();
25900 for (unsigned i = 1; i < NumArgs; ++i) {
25901 MachineOperand &Op = MI.getOperand(i);
25902 if (!(Op.isReg() && Op.isImplicit()))
25905 if (MI.hasOneMemOperand())
25906 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25908 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25909 .addReg(X86::XMM0);
25911 MI.eraseFromParent();
25915 // FIXME: Custom handling because TableGen doesn't support multiple implicit
25916 // defs in an instruction pattern
25917 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25918 const TargetInstrInfo *TII) {
25920 switch (MI.getOpcode()) {
25921 default: llvm_unreachable("illegal opcode!");
25922 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
25923 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25924 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
25925 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25926 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
25927 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25928 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
25929 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25932 DebugLoc dl = MI.getDebugLoc();
25933 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25935 unsigned NumArgs = MI.getNumOperands(); // remove the results
25936 for (unsigned i = 1; i < NumArgs; ++i) {
25937 MachineOperand &Op = MI.getOperand(i);
25938 if (!(Op.isReg() && Op.isImplicit()))
25941 if (MI.hasOneMemOperand())
25942 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25944 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25947 MI.eraseFromParent();
25951 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25952 const X86Subtarget &Subtarget) {
25953 DebugLoc dl = MI.getDebugLoc();
25954 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25956 // insert input VAL into EAX
25957 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25958 .addReg(MI.getOperand(0).getReg());
25959 // insert zero to ECX
25960 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25962 // insert zero to EDX
25963 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25965 // insert WRPKRU instruction
25966 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25968 MI.eraseFromParent(); // The pseudo is gone now.
25972 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25973 const X86Subtarget &Subtarget) {
25974 DebugLoc dl = MI.getDebugLoc();
25975 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25977 // insert zero to ECX
25978 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25980 // insert RDPKRU instruction
25981 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
25982 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25985 MI.eraseFromParent(); // The pseudo is gone now.
25989 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
25990 const X86Subtarget &Subtarget,
25992 DebugLoc dl = MI.getDebugLoc();
25993 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25994 // Address into RAX/EAX, other two args into ECX, EDX.
25995 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25996 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25997 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25998 for (int i = 0; i < X86::AddrNumOperands; ++i)
25999 MIB.add(MI.getOperand(i));
26001 unsigned ValOps = X86::AddrNumOperands;
26002 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
26003 .addReg(MI.getOperand(ValOps).getReg());
26004 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
26005 .addReg(MI.getOperand(ValOps + 1).getReg());
26007 // The instruction doesn't actually take any operands though.
26008 BuildMI(*BB, MI, dl, TII->get(Opc));
26010 MI.eraseFromParent(); // The pseudo is gone now.
26014 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
26015 const X86Subtarget &Subtarget) {
26016 DebugLoc dl = MI->getDebugLoc();
26017 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26018 // Address into RAX/EAX
26019 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26020 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26021 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26022 for (int i = 0; i < X86::AddrNumOperands; ++i)
26023 MIB.add(MI->getOperand(i));
26025 // The instruction doesn't actually take any operands though.
26026 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
26028 MI->eraseFromParent(); // The pseudo is gone now.
26034 MachineBasicBlock *
26035 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
26036 MachineBasicBlock *MBB) const {
26037 // Emit va_arg instruction on X86-64.
26039 // Operands to this pseudo-instruction:
26040 // 0 ) Output : destination address (reg)
26041 // 1-5) Input : va_list address (addr, i64mem)
26042 // 6 ) ArgSize : Size (in bytes) of vararg type
26043 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
26044 // 8 ) Align : Alignment of type
26045 // 9 ) EFLAGS (implicit-def)
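// For example, va_arg of an i32 typically uses ArgMode == 1 (pull from the GP
// register save area), a double uses ArgMode == 2 (pull from the XMM save
// area), and types passed in memory use ArgMode == 0 (overflow area only).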
26047 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
26048 static_assert(X86::AddrNumOperands == 5,
26049 "VAARG_64 assumes 5 address operands");
26051 unsigned DestReg = MI.getOperand(0).getReg();
26052 MachineOperand &Base = MI.getOperand(1);
26053 MachineOperand &Scale = MI.getOperand(2);
26054 MachineOperand &Index = MI.getOperand(3);
26055 MachineOperand &Disp = MI.getOperand(4);
26056 MachineOperand &Segment = MI.getOperand(5);
26057 unsigned ArgSize = MI.getOperand(6).getImm();
26058 unsigned ArgMode = MI.getOperand(7).getImm();
26059 unsigned Align = MI.getOperand(8).getImm();
26061 // Memory Reference
26062 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
26063 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26064 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26066 // Machine Information
26067 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26068 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
26069 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
26070 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
26071 DebugLoc DL = MI.getDebugLoc();
// struct va_list {
//   i32   gp_offset
//   i32   fp_offset
//   i64   overflow_area (address)
//   i64   reg_save_area (address)
// }
// sizeof(va_list) = 24
// alignment(va_list) = 8
26082 unsigned TotalNumIntRegs = 6;
26083 unsigned TotalNumXMMRegs = 8;
26084 bool UseGPOffset = (ArgMode == 1);
26085 bool UseFPOffset = (ArgMode == 2);
26086 unsigned MaxOffset = TotalNumIntRegs * 8 +
26087 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
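// For example: gp_offset counts 8-byte slots across the 6 GP argument
// registers, so it ranges over [0, 48); fp_offset counts 16-byte slots across
// the 8 XMM argument registers, so it ranges over [48, 176). MaxOffset is
// therefore 48 in GP mode and 176 in FP mode.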
// Align ArgSize to a multiple of 8.
26090 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
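// e.g. ArgSize = 12 rounds up to ArgSizeA8 = 16; multiples of 8 are unchanged.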
26091 bool NeedsAlign = (Align > 8);
26093 MachineBasicBlock *thisMBB = MBB;
26094 MachineBasicBlock *overflowMBB;
26095 MachineBasicBlock *offsetMBB;
26096 MachineBasicBlock *endMBB;
26098 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
26099 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
26100 unsigned OffsetReg = 0;
26102 if (!UseGPOffset && !UseFPOffset) {
26103 // If we only pull from the overflow region, we don't create a branch.
26104 // We don't need to alter control flow.
26105 OffsetDestReg = 0; // unused
26106 OverflowDestReg = DestReg;
26108 offsetMBB = nullptr;
26109 overflowMBB = thisMBB;
26112 // First emit code to check if gp_offset (or fp_offset) is below the bound.
26113 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
26114 // If not, pull from overflow_area. (branch to overflowMBB)
//
//       thisMBB
//         |     .
//         |        .
//     offsetMBB   overflowMBB
//         |        .
//         |     .
//        endMBB
26124 // Registers for the PHI in endMBB
26125 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
26126 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
26128 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26129 MachineFunction *MF = MBB->getParent();
26130 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26131 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26132 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26134 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26136 // Insert the new basic blocks
26137 MF->insert(MBBIter, offsetMBB);
26138 MF->insert(MBBIter, overflowMBB);
26139 MF->insert(MBBIter, endMBB);
26141 // Transfer the remainder of MBB and its successor edges to endMBB.
26142 endMBB->splice(endMBB->begin(), thisMBB,
26143 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
26144 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
26146 // Make offsetMBB and overflowMBB successors of thisMBB
26147 thisMBB->addSuccessor(offsetMBB);
26148 thisMBB->addSuccessor(overflowMBB);
26150 // endMBB is a successor of both offsetMBB and overflowMBB
26151 offsetMBB->addSuccessor(endMBB);
26152 overflowMBB->addSuccessor(endMBB);
26154 // Load the offset value into a register
26155 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26156 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
26160 .addDisp(Disp, UseFPOffset ? 4 : 0)
26162 .setMemRefs(MMOBegin, MMOEnd);
26164 // Check if there is enough room left to pull this argument.
26165 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
26167 .addImm(MaxOffset + 8 - ArgSizeA8);
26169 // Branch to "overflowMBB" if offset >= max
26170 // Fall through to "offsetMBB" otherwise
26171 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
26172 .addMBB(overflowMBB);
26175 // In offsetMBB, emit code to use the reg_save_area.
26177 assert(OffsetReg != 0);
26179 // Read the reg_save_area address.
26180 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
26181 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
26187 .setMemRefs(MMOBegin, MMOEnd);
26189 // Zero-extend the offset
26190 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
26191 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
26194 .addImm(X86::sub_32bit);
26196 // Add the offset to the reg_save_area to get the final address.
26197 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
26198 .addReg(OffsetReg64)
26199 .addReg(RegSaveReg);
26201 // Compute the offset for the next argument
26202 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26203 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
26205 .addImm(UseFPOffset ? 16 : 8);
26207 // Store it back into the va_list.
26208 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
26212 .addDisp(Disp, UseFPOffset ? 4 : 0)
26214 .addReg(NextOffsetReg)
26215 .setMemRefs(MMOBegin, MMOEnd);
26218 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
26223 // Emit code to use overflow area
26226 // Load the overflow_area address into a register.
26227 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
26228 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
26234 .setMemRefs(MMOBegin, MMOEnd);
26236 // If we need to align it, do so. Otherwise, just copy the address
26237 // to OverflowDestReg.
26239 // Align the overflow address
26240 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
26241 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
26243 // aligned_addr = (addr + (align-1)) & ~(align-1)
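// e.g. Align = 16: an address of 0x1008 becomes (0x1008 + 15) & ~15 = 0x1010.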
26244 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
26245 .addReg(OverflowAddrReg)
26248 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
26250 .addImm(~(uint64_t)(Align-1));
26252 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
26253 .addReg(OverflowAddrReg);
26256 // Compute the next overflow address after this argument.
26257 // (the overflow address should be kept 8-byte aligned)
26258 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
26259 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
26260 .addReg(OverflowDestReg)
26261 .addImm(ArgSizeA8);
26263 // Store the new overflow address.
26264 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
26270 .addReg(NextAddrReg)
26271 .setMemRefs(MMOBegin, MMOEnd);
26273 // If we branched, emit the PHI to the front of endMBB.
26275 BuildMI(*endMBB, endMBB->begin(), DL,
26276 TII->get(X86::PHI), DestReg)
26277 .addReg(OffsetDestReg).addMBB(offsetMBB)
26278 .addReg(OverflowDestReg).addMBB(overflowMBB);
26281 // Erase the pseudo instruction
26282 MI.eraseFromParent();
26287 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26288 MachineInstr &MI, MachineBasicBlock *MBB) const {
// Emit code to save XMM registers to the stack. The ABI says that the
// number of registers to save is given in %al, so it's theoretically
// possible to do an indirect jump trick to avoid saving all of them;
// however, this code takes a simpler approach and just executes all
// of the stores if %al is non-zero. It's less code, it's probably
// easier on the hardware branch predictor, and stores aren't all that
// expensive anyway.
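// The emitted structure is roughly (the guard branch is omitted for Win64):
//
//   testb %al, %al
//   je    EndMBB
// XMMSaveMBB:
//   movaps %xmm0, <reg-save-slot + 0>
//   movaps %xmm1, <reg-save-slot + 16>
//   ...
// EndMBB: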
26297 // Create the new basic blocks. One block contains all the XMM stores,
26298 // and one block is the final destination regardless of whether any
26299 // stores were performed.
26300 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26301 MachineFunction *F = MBB->getParent();
26302 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26303 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26304 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26305 F->insert(MBBIter, XMMSaveMBB);
26306 F->insert(MBBIter, EndMBB);
26308 // Transfer the remainder of MBB and its successor edges to EndMBB.
26309 EndMBB->splice(EndMBB->begin(), MBB,
26310 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26311 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26313 // The original block will now fall through to the XMM save block.
26314 MBB->addSuccessor(XMMSaveMBB);
26315 // The XMMSaveMBB will fall through to the end block.
26316 XMMSaveMBB->addSuccessor(EndMBB);
26318 // Now add the instructions.
26319 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26320 DebugLoc DL = MI.getDebugLoc();
26322 unsigned CountReg = MI.getOperand(0).getReg();
26323 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26324 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26326 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
26327 // If %al is 0, branch around the XMM save block.
26328 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26329 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26330 MBB->addSuccessor(EndMBB);
26333 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26334 // that was just emitted, but clearly shouldn't be "saved".
26335 assert((MI.getNumOperands() <= 3 ||
26336 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26337 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26338 "Expected last argument to be EFLAGS");
26339 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26340 // In the XMM save block, save all the XMM argument registers.
26341 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26342 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26343 MachineMemOperand *MMO = F->getMachineMemOperand(
26344 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26345 MachineMemOperand::MOStore,
26346 /*Size=*/16, /*Align=*/16);
26347 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26348 .addFrameIndex(RegSaveFrameIndex)
26349 .addImm(/*Scale=*/1)
26350 .addReg(/*IndexReg=*/0)
26351 .addImm(/*Disp=*/Offset)
26352 .addReg(/*Segment=*/0)
26353 .addReg(MI.getOperand(i).getReg())
26354 .addMemOperand(MMO);
26357 MI.eraseFromParent(); // The pseudo instruction is gone now.
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
26367 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26368 MachineBasicBlock* BB,
26369 const TargetRegisterInfo* TRI) {
26370 // Scan forward through BB for a use/def of EFLAGS.
26371 MachineBasicBlock::iterator miI(std::next(SelectItr));
26372 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26373 const MachineInstr& mi = *miI;
26374 if (mi.readsRegister(X86::EFLAGS))
26376 if (mi.definesRegister(X86::EFLAGS))
26377 break; // Should have kill-flag - update below.
// If we hit the end of the block, check whether EFLAGS is live into a
// successor.
26382 if (miI == BB->end()) {
26383 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26384 sEnd = BB->succ_end();
26385 sItr != sEnd; ++sItr) {
26386 MachineBasicBlock* succ = *sItr;
26387 if (succ->isLiveIn(X86::EFLAGS))
26392 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26393 // out. SelectMI should have a kill flag on EFLAGS.
26394 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic block with
// a conditional jump around it.
26401 static bool isCMOVPseudo(MachineInstr &MI) {
26402 switch (MI.getOpcode()) {
26403 case X86::CMOV_FR32:
26404 case X86::CMOV_FR64:
26405 case X86::CMOV_GR8:
26406 case X86::CMOV_GR16:
26407 case X86::CMOV_GR32:
26408 case X86::CMOV_RFP32:
26409 case X86::CMOV_RFP64:
26410 case X86::CMOV_RFP80:
26411 case X86::CMOV_V2F64:
26412 case X86::CMOV_V2I64:
26413 case X86::CMOV_V4F32:
26414 case X86::CMOV_V4F64:
26415 case X86::CMOV_V4I64:
26416 case X86::CMOV_V16F32:
26417 case X86::CMOV_V8F32:
26418 case X86::CMOV_V8F64:
26419 case X86::CMOV_V8I64:
26420 case X86::CMOV_V8I1:
26421 case X86::CMOV_V16I1:
26422 case X86::CMOV_V32I1:
26423 case X86::CMOV_V64I1:
// Helper function, which inserts PHI functions into SinkMBB:
//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
// the last PHI inserted.
26436 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
26437 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
26438 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
26439 MachineBasicBlock *SinkMBB) {
26440 MachineFunction *MF = TrueMBB->getParent();
26441 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
26442 DebugLoc DL = MIItBegin->getDebugLoc();
26444 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
26445 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26447 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
// That also means that PHI construction must work forward from earlier to
// later, and that the code must maintain a mapping from each earlier PHI's
// destination register to the registers that went into that PHI.
26455 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
26456 MachineInstrBuilder MIB;
26458 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
26459 unsigned DestReg = MIIt->getOperand(0).getReg();
26460 unsigned Op1Reg = MIIt->getOperand(1).getReg();
26461 unsigned Op2Reg = MIIt->getOperand(2).getReg();
26463 // If this CMOV we are generating is the opposite condition from
26464 // the jump we generated, then we have to swap the operands for the
26465 // PHI that is going to be generated.
26466 if (MIIt->getOperand(3).getImm() == OppCC)
26467 std::swap(Op1Reg, Op2Reg);
26469 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
26470 Op1Reg = RegRewriteTable[Op1Reg].first;
26472 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
26473 Op2Reg = RegRewriteTable[Op2Reg].second;
26475 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
26481 // Add this PHI to the rewrite table.
26482 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
// Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
26489 MachineBasicBlock *
26490 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
26491 MachineInstr &SecondCascadedCMOV,
26492 MachineBasicBlock *ThisMBB) const {
26493 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26494 DebugLoc DL = FirstCMOV.getDebugLoc();
26496 // We lower cascaded CMOVs such as
26498 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
26500 // to two successive branches.
26502 // Without this, we would add a PHI between the two jumps, which ends up
26503 // creating a few copies all around. For instance, for
26505 // (sitofp (zext (fcmp une)))
26507 // we would generate:
//   ucomiss %xmm1, %xmm0
//   movss   <1.0f>, %xmm0
//   movaps  %xmm0, %xmm1
//   jne     .LBB5_2
//   xorps   %xmm1, %xmm1
// .LBB5_2:
//   jp      .LBB5_4
//   movaps  %xmm1, %xmm0
// .LBB5_4:
//   retq
// because this custom-inserter would have generated two back-to-back diamonds:
//
//   A: X = ...; Y = ...
//   B: empty
//   C: Z = PHI [X, A], [Y, B]
//   D: empty
//   E: PHI [X, C], [Z, D]
//
// If we lower both CMOVs in a single step, we can instead generate:
//
//   A: X = ...; Y = ...
//   C: empty, conditional branch to E
//   D: empty, falls through to E
//   E: PHI [X, A], [X, C], [Y, D]
26554 // Which, in our sitofp/fcmp example, gives us something like:
//   ucomiss %xmm1, %xmm0
//   movss   <1.0f>, %xmm0
//   jne     .LBB5_4
//   jp      .LBB5_4
//   xorps   %xmm0, %xmm0
// .LBB5_4:
//   retq
26565 // We lower cascaded CMOV into two successive branches to the same block.
26566 // EFLAGS is used by both, so mark it as live in the second.
26567 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26568 MachineFunction *F = ThisMBB->getParent();
26569 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26570 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26571 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
26573 MachineFunction::iterator It = ++ThisMBB->getIterator();
26574 F->insert(It, FirstInsertedMBB);
26575 F->insert(It, SecondInsertedMBB);
26576 F->insert(It, SinkMBB);
26578 // For a cascaded CMOV, we lower it to two successive branches to
26579 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
26580 // the FirstInsertedMBB.
26581 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
26583 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26584 // live into the sink and copy blocks.
26585 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26586 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
26587 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
26588 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
26589 SinkMBB->addLiveIn(X86::EFLAGS);
26592 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26593 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26594 std::next(MachineBasicBlock::iterator(FirstCMOV)),
26596 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26598 // Fallthrough block for ThisMBB.
26599 ThisMBB->addSuccessor(FirstInsertedMBB);
26600 // The true block target of the first branch is always SinkMBB.
26601 ThisMBB->addSuccessor(SinkMBB);
26602 // Fallthrough block for FirstInsertedMBB.
26603 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
26604 // The true block for the branch of FirstInsertedMBB.
26605 FirstInsertedMBB->addSuccessor(SinkMBB);
26606 // This is fallthrough.
26607 SecondInsertedMBB->addSuccessor(SinkMBB);
26609 // Create the conditional branch instructions.
26610 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
26611 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
26612 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26614 X86::CondCode SecondCC =
26615 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
26616 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
26617 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
26620 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
26621 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
26622 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
26623 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
26624 MachineInstrBuilder MIB =
26625 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
26627 .addMBB(SecondInsertedMBB)
// SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
// (the True operand of the SELECT_CC/CMOV nodes).
26633 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
26634 // Copy the PHI result to the register defined by the second CMOV.
26635 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
26636 TII->get(TargetOpcode::COPY),
26637 SecondCascadedCMOV.getOperand(0).getReg())
26638 .addReg(FirstCMOV.getOperand(0).getReg());
26640 // Now remove the CMOVs.
26641 FirstCMOV.eraseFromParent();
26642 SecondCascadedCMOV.eraseFromParent();
26647 MachineBasicBlock *
26648 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
26649 MachineBasicBlock *ThisMBB) const {
26650 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26651 DebugLoc DL = MI.getDebugLoc();
26653 // To "insert" a SELECT_CC instruction, we actually have to insert the
26654 // diamond control-flow pattern. The incoming instruction knows the
26655 // destination vreg to set, the condition code register to branch on, the
26656 // true/false values to select between and a branch opcode to use.
26661 // cmpTY ccX, r1, r2
26663 // fallthrough --> FalseMBB
26665 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
26666 // as described above, by inserting a BB, and then making a PHI at the join
26667 // point to select the true and false operands of the CMOV in the PHI.
// The code also handles two different cases of multiple CMOV opcodes
// in a row:
//
// Case 1:
// In this case, there are multiple CMOVs in a row, all of which are based on
26674 // the same condition setting (or the exact opposite condition setting).
26675 // In this case we can lower all the CMOVs using a single inserted BB, and
26676 // then make a number of PHIs at the join point to model the CMOVs. The only
26677 // trickiness here, is that in a case like:
26679 // t2 = CMOV cond1 t1, f1
26680 // t3 = CMOV cond1 t2, f2
26682 // when rewriting this into PHIs, we have to perform some renaming on the
26683 // temps since you cannot have a PHI operand refer to a PHI result earlier
26684 // in the same block. The "simple" but wrong lowering would be:
26686 // t2 = PHI t1(BB1), f1(BB2)
26687 // t3 = PHI t2(BB1), f2(BB2)
26689 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
26690 // renaming is to note that on the path through BB1, t2 is really just a
26691 // copy of t1, and do that renaming, properly generating:
26693 // t2 = PHI t1(BB1), f1(BB2)
26694 // t3 = PHI t1(BB1), f2(BB2)
// Case 2:
// CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
// function - EmitLoweredCascadedSelect.
26700 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
26701 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26702 MachineInstr *LastCMOV = &MI;
26703 MachineBasicBlock::iterator NextMIIt =
26704 std::next(MachineBasicBlock::iterator(MI));
26706 // Check for case 1, where there are multiple CMOVs with the same condition
26707 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
26708 // number of jumps the most.
26710 if (isCMOVPseudo(MI)) {
26711 // See if we have a string of CMOVS with the same condition.
26712 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
26713 (NextMIIt->getOperand(3).getImm() == CC ||
26714 NextMIIt->getOperand(3).getImm() == OppCC)) {
26715 LastCMOV = &*NextMIIt;
// This checks for case 2, but only if we didn't already find case 1
// (as indicated by LastCMOV still pointing at MI).
26722 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
26723 NextMIIt->getOpcode() == MI.getOpcode() &&
26724 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
26725 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
26726 NextMIIt->getOperand(1).isKill()) {
26727 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
26730 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26731 MachineFunction *F = ThisMBB->getParent();
26732 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
26733 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
26735 MachineFunction::iterator It = ++ThisMBB->getIterator();
26736 F->insert(It, FalseMBB);
26737 F->insert(It, SinkMBB);
26739 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26740 // live into the sink and copy blocks.
26741 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26742 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
26743 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
26744 FalseMBB->addLiveIn(X86::EFLAGS);
26745 SinkMBB->addLiveIn(X86::EFLAGS);
26748 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26749 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26750 std::next(MachineBasicBlock::iterator(LastCMOV)),
26752 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26754 // Fallthrough block for ThisMBB.
26755 ThisMBB->addSuccessor(FalseMBB);
26756 // The true block target of the first (or only) branch is always a SinkMBB.
26757 ThisMBB->addSuccessor(SinkMBB);
26758 // Fallthrough block for FalseMBB.
26759 FalseMBB->addSuccessor(SinkMBB);
26761 // Create the conditional branch instruction.
26762 unsigned Opc = X86::GetCondBranchFromCond(CC);
26763 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26766 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
26768 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
26769 MachineBasicBlock::iterator MIItEnd =
26770 std::next(MachineBasicBlock::iterator(LastCMOV));
26771 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
26773 // Now remove the CMOV(s).
26774 ThisMBB->erase(MIItBegin, MIItEnd);
26779 MachineBasicBlock *
26780 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
26781 MachineBasicBlock *BB) const {
// Combine the following atomic floating-point modification pattern:
//   a.store(reg OP a.load(acquire), release)
// Transform it into:
//   OPss (%gpr), %xmm
//   movss %xmm, (%gpr)
// or the sd equivalent for 64-bit operations.
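// This is sound on x86 because ordinary loads already have acquire semantics
// and ordinary stores already have release semantics under the TSO memory
// model, so no explicit fences are needed for this acquire-load/release-store
// pair.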
unsigned FOp, MOp;
switch (MI.getOpcode()) {
26790 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
26791 case X86::RELEASE_FADD32mr:
26792 FOp = X86::ADDSSrm;
26793 MOp = X86::MOVSSmr;
26795 case X86::RELEASE_FADD64mr:
26796 FOp = X86::ADDSDrm;
26797 MOp = X86::MOVSDmr;
26800 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26801 DebugLoc DL = MI.getDebugLoc();
26802 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
26803 unsigned ValOpIdx = X86::AddrNumOperands;
26804 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
26805 MachineInstrBuilder MIB =
26806 BuildMI(*BB, MI, DL, TII->get(FOp),
26807 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
26809 for (int i = 0; i < X86::AddrNumOperands; ++i) {
26810 MachineOperand &Operand = MI.getOperand(i);
26811 // Clear any kill flags on register operands as we'll create a second
26812 // instruction using the same address operands.
26813 if (Operand.isReg())
26814 Operand.setIsKill(false);
26817 MachineInstr *FOpMI = MIB;
26818 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
26819 for (int i = 0; i < X86::AddrNumOperands; ++i)
26820 MIB.add(MI.getOperand(i));
26821 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
26822 MI.eraseFromParent(); // The pseudo instruction is gone now.
26826 MachineBasicBlock *
26827 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
26828 MachineBasicBlock *BB) const {
26829 MachineFunction *MF = BB->getParent();
26830 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26831 DebugLoc DL = MI.getDebugLoc();
26832 const BasicBlock *LLVM_BB = BB->getBasicBlock();
26834 assert(MF->shouldSplitStack());
26836 const bool Is64Bit = Subtarget.is64Bit();
26837 const bool IsLP64 = Subtarget.isTarget64BitLP64();
26839 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
26840 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
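// TlsOffset addresses the stack-limit slot that the split-stack runtime keeps
// in the thread control block (e.g. %fs:0x70 on LP64), which the CMP below
// compares against the proposed new stack pointer.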
// BB:
//  ... [Till the alloca]
//  If stacklet is not large enough, jump to mallocMBB
//
// bumpMBB:
//  Allocate by subtracting from RSP
//  Jump to continueMBB
//
// mallocMBB:
//  Allocate by call to runtime
//
// continueMBB:
//  [rest of original BB]
26858 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26859 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26860 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26862 MachineRegisterInfo &MRI = MF->getRegInfo();
26863 const TargetRegisterClass *AddrRegClass =
26864 getRegClassFor(getPointerTy(MF->getDataLayout()));
26866 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26867 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26868 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
26869 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
26870 sizeVReg = MI.getOperand(1).getReg(),
26872 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
26874 MachineFunction::iterator MBBIter = ++BB->getIterator();
26876 MF->insert(MBBIter, bumpMBB);
26877 MF->insert(MBBIter, mallocMBB);
26878 MF->insert(MBBIter, continueMBB);
26880 continueMBB->splice(continueMBB->begin(), BB,
26881 std::next(MachineBasicBlock::iterator(MI)), BB->end());
26882 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
26884 // Add code to the main basic block to check if the stack limit has been hit,
26885 // and if so, jump to mallocMBB otherwise to bumpMBB.
26886 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
26887 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
26888 .addReg(tmpSPVReg).addReg(sizeVReg);
26889 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
26890 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26891 .addReg(SPLimitVReg);
26892 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
26894 // bumpMBB simply decreases the stack pointer, since we know the current
26895 // stacklet has enough space.
26896 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26897 .addReg(SPLimitVReg);
26898 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26899 .addReg(SPLimitVReg);
26900 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26902 // Calls into a routine in libgcc to allocate more space from the heap.
26903 const uint32_t *RegMask =
26904 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
26906 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26908 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26909 .addExternalSymbol("__morestack_allocate_stack_space")
26910 .addRegMask(RegMask)
26911 .addReg(X86::RDI, RegState::Implicit)
26912 .addReg(X86::RAX, RegState::ImplicitDefine);
26913 } else if (Is64Bit) {
26914 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26916 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26917 .addExternalSymbol("__morestack_allocate_stack_space")
26918 .addRegMask(RegMask)
26919 .addReg(X86::EDI, RegState::Implicit)
26920 .addReg(X86::EAX, RegState::ImplicitDefine);
26922 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26924 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26925 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26926 .addExternalSymbol("__morestack_allocate_stack_space")
26927 .addRegMask(RegMask)
26928 .addReg(X86::EAX, RegState::ImplicitDefine);
26932 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
26935 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26936 .addReg(IsLP64 ? X86::RAX : X86::EAX);
26937 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26939 // Set up the CFG correctly.
26940 BB->addSuccessor(bumpMBB);
26941 BB->addSuccessor(mallocMBB);
26942 mallocMBB->addSuccessor(continueMBB);
26943 bumpMBB->addSuccessor(continueMBB);
26945 // Take care of the PHI nodes.
26946 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26947 MI.getOperand(0).getReg())
26948 .addReg(mallocPtrVReg)
26950 .addReg(bumpSPPtrVReg)
26953 // Delete the original pseudo instruction.
26954 MI.eraseFromParent();
26957 return continueMBB;
26960 MachineBasicBlock *
26961 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26962 MachineBasicBlock *BB) const {
26963 MachineFunction *MF = BB->getParent();
26964 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26965 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26966 DebugLoc DL = MI.getDebugLoc();
26968 assert(!isAsynchronousEHPersonality(
26969 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
26970 "SEH does not use catchret!");
26972 // Only 32-bit EH needs to worry about manually restoring stack pointers.
26973 if (!Subtarget.is32Bit())
26976 // C++ EH creates a new target block to hold the restore code, and wires up
26977 // the new block to the return destination with a normal JMP_4.
26978 MachineBasicBlock *RestoreMBB =
26979 MF->CreateMachineBasicBlock(BB->getBasicBlock());
26980 assert(BB->succ_size() == 1);
26981 MF->insert(std::next(BB->getIterator()), RestoreMBB);
26982 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26983 BB->addSuccessor(RestoreMBB);
26984 MI.getOperand(0).setMBB(RestoreMBB);
26986 auto RestoreMBBI = RestoreMBB->begin();
26987 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26988 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
26992 MachineBasicBlock *
26993 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26994 MachineBasicBlock *BB) const {
26995 MachineFunction *MF = BB->getParent();
26996 const Constant *PerFn = MF->getFunction().getPersonalityFn();
26997 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26998 // Only 32-bit SEH requires special handling for catchpad.
26999 if (IsSEH && Subtarget.is32Bit()) {
27000 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27001 DebugLoc DL = MI.getDebugLoc();
27002 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
27004 MI.eraseFromParent();
27008 MachineBasicBlock *
27009 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
27010 MachineBasicBlock *BB) const {
// So, here we replace TLSADDR with the sequence:
// adjust_stackdown -> TLSADDR -> adjust_stackup.
// We need this because TLSADDR is lowered into a call
// inside MC, so without the two markers shrink-wrapping
// may push the prologue/epilogue past them.
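// For reference, on x86-64 the TLSADDR pseudo is ultimately emitted as the
// general-dynamic TLS sequence, roughly:
//   leaq x@tlsgd(%rip), %rdi        # plus padding prefixes required by the ABI
//   callq __tls_get_addr@PLT
// i.e. a real call, which is why the call-frame markers are required.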
27016 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27017 DebugLoc DL = MI.getDebugLoc();
27018 MachineFunction &MF = *BB->getParent();
27020 // Emit CALLSEQ_START right before the instruction.
27021 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
27022 MachineInstrBuilder CallseqStart =
27023 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
27024 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
27026 // Emit CALLSEQ_END right after the instruction.
27027 // We don't call erase from parent because we want to keep the
27028 // original instruction around.
27029 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
27030 MachineInstrBuilder CallseqEnd =
27031 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
27032 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
27037 MachineBasicBlock *
27038 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
27039 MachineBasicBlock *BB) const {
27040 // This is pretty easy. We're taking the value that we received from
27041 // our load from the relocation, sticking it in either RDI (x86-64)
27042 // or EAX and doing an indirect call. The return value will then
27043 // be in the normal return register.
27044 MachineFunction *F = BB->getParent();
27045 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27046 DebugLoc DL = MI.getDebugLoc();
27048 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
27049 assert(MI.getOperand(3).isGlobal() && "This should be a global");
27051 // Get a register mask for the lowered call.
27052 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
27053 // proper register mask.
27054 const uint32_t *RegMask =
27055 Subtarget.is64Bit() ?
27056 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
27057 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
27058 if (Subtarget.is64Bit()) {
27059 MachineInstrBuilder MIB =
27060 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
27064 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27065 MI.getOperand(3).getTargetFlags())
27067 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
27068 addDirectMem(MIB, X86::RDI);
27069 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
27070 } else if (!isPositionIndependent()) {
27071 MachineInstrBuilder MIB =
27072 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27076 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27077 MI.getOperand(3).getTargetFlags())
27079 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27080 addDirectMem(MIB, X86::EAX);
27081 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27083 MachineInstrBuilder MIB =
27084 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27085 .addReg(TII->getGlobalBaseReg(F))
27088 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27089 MI.getOperand(3).getTargetFlags())
27091 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27092 addDirectMem(MIB, X86::EAX);
27093 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27096 MI.eraseFromParent(); // The pseudo instruction is gone now.
27100 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
27102 case X86::RETPOLINE_CALL32:
27103 return X86::CALLpcrel32;
27104 case X86::RETPOLINE_CALL64:
27105 return X86::CALL64pcrel32;
27106 case X86::RETPOLINE_TCRETURN32:
27107 return X86::TCRETURNdi;
27108 case X86::RETPOLINE_TCRETURN64:
27109 return X86::TCRETURNdi64;
27111 llvm_unreachable("not retpoline opcode");
27114 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
27116 if (Subtarget.useRetpolineExternalThunk()) {
27117 // When using an external thunk for retpolines, we pick names that match the
27118 // names GCC happens to use as well. This helps simplify the implementation
27119 // of the thunks for kernels where they have no easy ability to create
27120 // aliases and are doing non-trivial configuration of the thunk's body. For
27121 // example, the Linux kernel will do boot-time hot patching of the thunk
27122 // bodies and cannot easily export aliases of these to loaded modules.
27124 // Note that at any point in the future, we may need to change the semantics
27125 // of how we implement retpolines and at that time will likely change the
// name of the called thunk. Essentially, there is no hard guarantee that
// LLVM will generate calls to specific thunks; we merely make a best-effort
27128 // attempt to help out kernels and other systems where duplicating the
27129 // thunks is costly.
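// For example, an indirect call through %r11 on a 64-bit target becomes
// "callq __x86_indirect_thunk_r11" when external thunks are in use, and
// "callq __llvm_retpoline_r11" when LLVM emits its own COMDAT thunk.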
27132 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27133 return "__x86_indirect_thunk_eax";
27135 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27136 return "__x86_indirect_thunk_ecx";
27138 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27139 return "__x86_indirect_thunk_edx";
27141 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27142 return "__x86_indirect_thunk_edi";
27144 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27145 return "__x86_indirect_thunk_r11";
27147 llvm_unreachable("unexpected reg for retpoline");
27150 // When targeting an internal COMDAT thunk use an LLVM-specific name.
27153 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27154 return "__llvm_retpoline_eax";
27156 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27157 return "__llvm_retpoline_ecx";
27159 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27160 return "__llvm_retpoline_edx";
27162 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27163 return "__llvm_retpoline_edi";
27165 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27166 return "__llvm_retpoline_r11";
27168 llvm_unreachable("unexpected reg for retpoline");
27171 MachineBasicBlock *
27172 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
27173 MachineBasicBlock *BB) const {
27174 // Copy the virtual register into the R11 physical register and
27175 // call the retpoline thunk.
27176 DebugLoc DL = MI.getDebugLoc();
27177 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27178 unsigned CalleeVReg = MI.getOperand(0).getReg();
27179 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
27181 // Find an available scratch register to hold the callee. On 64-bit, we can
27182 // just use R11, but we scan for uses anyway to ensure we don't generate
27183 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
27184 // already a register use operand to the call to hold the callee. If none
27185 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
27186 // register and ESI is the base pointer to realigned stack frames with VLAs.
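// The net effect is, e.g. on 32-bit:
//   movl %<callee-vreg>, %eax
//   calll __llvm_retpoline_eax
// so the only indirect branch lives inside the thunk body.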
27187 SmallVector<unsigned, 3> AvailableRegs;
27188 if (Subtarget.is64Bit())
27189 AvailableRegs.push_back(X86::R11);
27191 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
27193 // Zero out any registers that are already used.
27194 for (const auto &MO : MI.operands()) {
27195 if (MO.isReg() && MO.isUse())
27196 for (unsigned &Reg : AvailableRegs)
27197 if (Reg == MO.getReg())
27201 // Choose the first remaining non-zero available register.
27202 unsigned AvailableReg = 0;
27203 for (unsigned MaybeReg : AvailableRegs) {
27205 AvailableReg = MaybeReg;
27210 report_fatal_error("calling convention incompatible with retpoline, no "
27211 "available registers");
27213 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
27215 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
27216 .addReg(CalleeVReg);
27217 MI.getOperand(0).ChangeToES(Symbol);
27218 MI.setDesc(TII->get(Opc));
27219 MachineInstrBuilder(*BB->getParent(), &MI)
27220 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
27224 MachineBasicBlock *
27225 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
27226 MachineBasicBlock *MBB) const {
27227 DebugLoc DL = MI.getDebugLoc();
27228 MachineFunction *MF = MBB->getParent();
27229 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27230 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27231 MachineRegisterInfo &MRI = MF->getRegInfo();
27233 const BasicBlock *BB = MBB->getBasicBlock();
27234 MachineFunction::iterator I = ++MBB->getIterator();
27236 // Memory Reference
27237 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27238 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27241 unsigned MemOpndSlot = 0;
27243 unsigned CurOp = 0;
27245 DstReg = MI.getOperand(CurOp++).getReg();
27246 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27247 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
27249 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27250 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
27252 MemOpndSlot = CurOp;
27254 MVT PVT = getPointerTy(MF->getDataLayout());
27255 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27256 "Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
//  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
//  SjLjSetup restoreMBB
//
// mainMBB:
//  v_main = 0
//
// sinkMBB:
//  v = phi(v_main/mainMBB, v_restore/restoreMBB)
//
// restoreMBB:
//  if base pointer being used, load it from frame
//  v_restore = 1
27274 MachineBasicBlock *thisMBB = MBB;
27275 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27276 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27277 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
27278 MF->insert(I, mainMBB);
27279 MF->insert(I, sinkMBB);
27280 MF->push_back(restoreMBB);
27281 restoreMBB->setHasAddressTaken();
27283 MachineInstrBuilder MIB;
27285 // Transfer the remainder of BB and its successor edges to sinkMBB.
27286 sinkMBB->splice(sinkMBB->begin(), MBB,
27287 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27288 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27291 unsigned PtrStoreOpc = 0;
27292 unsigned LabelReg = 0;
27293 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27294 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27295 !isPositionIndependent();
27297 // Prepare IP either in reg or imm.
27298 if (!UseImmLabel) {
27299 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27300 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27301 LabelReg = MRI.createVirtualRegister(PtrRC);
27302 if (Subtarget.is64Bit()) {
27303 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
27307 .addMBB(restoreMBB)
27310 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
27311 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
27312 .addReg(XII->getGlobalBaseReg(MF))
27315 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
27319 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27321 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
27322 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27323 if (i == X86::AddrDisp)
27324 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
27326 MIB.add(MI.getOperand(MemOpndSlot + i));
27329 MIB.addReg(LabelReg);
27331 MIB.addMBB(restoreMBB);
27332 MIB.setMemRefs(MMOBegin, MMOEnd);
27334 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
27335 .addMBB(restoreMBB);
27337 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
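// Attach a regmask that preserves nothing: control may re-enter at restoreMBB
// via longjmp with arbitrary register contents, so no value can be assumed to
// survive across the setup.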
27338 MIB.addRegMask(RegInfo->getNoPreservedMask());
27339 thisMBB->addSuccessor(mainMBB);
27340 thisMBB->addSuccessor(restoreMBB);
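// mainMBB: the normal (non-longjmp) path, where setjmp returns 0.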
27344 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
27345 mainMBB->addSuccessor(sinkMBB);
27348 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
27349 TII->get(X86::PHI), DstReg)
27350 .addReg(mainDstReg).addMBB(mainMBB)
27351 .addReg(restoreDstReg).addMBB(restoreMBB);
27354 if (RegInfo->hasBasePointer(*MF)) {
27355 const bool Uses64BitFramePtr =
27356 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27357 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
27358 X86FI->setRestoreBasePointer(MF);
27359 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
27360 unsigned BasePtr = RegInfo->getBaseRegister();
27361 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
27362 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
27363 FramePtr, true, X86FI->getRestoreBasePointerOffset())
27364 .setMIFlag(MachineInstr::FrameSetup);
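// restoreMBB: reached when a longjmp targets this buffer; make setjmp return 1
// and rejoin at sinkMBB.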
27366 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
27367 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
27368 restoreMBB->addSuccessor(sinkMBB);
27370 MI.eraseFromParent();
27374 MachineBasicBlock *
27375 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
27376 MachineBasicBlock *MBB) const {
27377 DebugLoc DL = MI.getDebugLoc();
27378 MachineFunction *MF = MBB->getParent();
27379 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27380 MachineRegisterInfo &MRI = MF->getRegInfo();
27382 // Memory Reference
27383 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27384 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27386 MVT PVT = getPointerTy(MF->getDataLayout());
27387 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27388 "Invalid Pointer Size!");
27390 const TargetRegisterClass *RC =
27391 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27392 unsigned Tmp = MRI.createVirtualRegister(RC);
27393 // Since FP is only updated here but NOT referenced, it's treated as a GPR.
27394 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27395 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
27396 unsigned SP = RegInfo->getStackRegister();
27398 MachineInstrBuilder MIB;
27400 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27401 const int64_t SPOffset = 2 * PVT.getStoreSize();
27403 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
27404 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
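// Reload the frame pointer, the saved IP and the stack pointer from their
// jmp_buf slots, then jump to the reloaded IP.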
27407 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
27408 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
27409 MIB.add(MI.getOperand(i));
27410 MIB.setMemRefs(MMOBegin, MMOEnd);
27412 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
27413 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27414 if (i == X86::AddrDisp)
27415 MIB.addDisp(MI.getOperand(i), LabelOffset);
27417 MIB.add(MI.getOperand(i));
27419 MIB.setMemRefs(MMOBegin, MMOEnd);
27421 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
27422 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27423 if (i == X86::AddrDisp)
27424 MIB.addDisp(MI.getOperand(i), SPOffset);
27426 MIB.add(MI.getOperand(i));
27428 MIB.setMemRefs(MMOBegin, MMOEnd);
27430 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
27432 MI.eraseFromParent();
27436 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
27437 MachineBasicBlock *MBB,
27438 MachineBasicBlock *DispatchBB, int FI) const {
27440 DebugLoc DL = MI.getDebugLoc();
27441 MachineFunction *MF = MBB->getParent();
27442 MachineRegisterInfo *MRI = &MF->getRegInfo();
27443 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27445 MVT PVT = getPointerTy(MF->getDataLayout());
27446 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
27451 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27452 !isPositionIndependent();
27455 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27457 const TargetRegisterClass *TRC =
27458 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27459 VR = MRI->createVirtualRegister(TRC);
27460 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27462 if (Subtarget.is64Bit())
27463 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
27467 .addMBB(DispatchBB)
27470 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
27471 .addReg(0) /* TII->getGlobalBaseReg(MF) */
27474 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
27478 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
27479 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
27481 MIB.addMBB(DispatchBB);
27486 MachineBasicBlock *
27487 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
27488 MachineBasicBlock *BB) const {
27489 DebugLoc DL = MI.getDebugLoc();
27490 MachineFunction *MF = BB->getParent();
27491 MachineFrameInfo &MFI = MF->getFrameInfo();
27492 MachineRegisterInfo *MRI = &MF->getRegInfo();
27493 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27494 int FI = MFI.getFunctionContextIndex();
27496 // Get a mapping of the call site numbers to all of the landing pads they're
27497 // associated with.
27498 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
27499 unsigned MaxCSNum = 0;
27500 for (auto &MBB : *MF) {
27501 if (!MBB.isEHPad())
27504 MCSymbol *Sym = nullptr;
27505 for (const auto &MI : MBB) {
27506 if (MI.isDebugValue())
27509 assert(MI.isEHLabel() && "expected EH_LABEL");
27510 Sym = MI.getOperand(0).getMCSymbol();
27514 if (!MF->hasCallSiteLandingPad(Sym))
27517 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
27518 CallSiteNumToLPad[CSI].push_back(&MBB);
27519 MaxCSNum = std::max(MaxCSNum, CSI);
27523 // Get an ordered list of the machine basic blocks for the jump table.
27524 std::vector<MachineBasicBlock *> LPadList;
27525 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
27526 LPadList.reserve(CallSiteNumToLPad.size());
27528 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
27529 for (auto &LP : CallSiteNumToLPad[CSI]) {
27530 LPadList.push_back(LP);
27531 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
27535 assert(!LPadList.empty() &&
27536 "No landing pad destinations for the dispatch jump table!");
27538 // Create the MBBs for the dispatch code.
27540 // Shove the dispatch's address into the return slot in the function context.
27541 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
27542 DispatchBB->setIsEHPad(true);
27544 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
27545 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
27546 DispatchBB->addSuccessor(TrapBB);
27548 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
27549 DispatchBB->addSuccessor(DispContBB);
27552 MF->push_back(DispatchBB);
27553 MF->push_back(DispContBB);
27554 MF->push_back(TrapBB);
27556 // Insert code into the entry block that creates and registers the function context.
27558 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
27560 // Create the jump table and associated information
27561 unsigned JTE = getJumpTableEncoding();
27562 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
27563 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
27565 const X86RegisterInfo &RI = TII->getRegisterInfo();
27566 // Add a register mask with no preserved registers. This results in all
27567 // registers being marked as clobbered.
27568 if (RI.hasBasePointer(*MF)) {
27569 const bool FPIs64Bit =
27570 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27571 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
27572 MFI->setRestoreBasePointer(MF);
27574 unsigned FP = RI.getFrameRegister(*MF);
27575 unsigned BP = RI.getBaseRegister();
27576 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
27577 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
27578 MFI->getRestoreBasePointerOffset())
27579 .addRegMask(RI.getNoPreservedMask());
27581 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
27582 .addRegMask(RI.getNoPreservedMask());
27585 // IReg is used as an index in a memory operand and therefore can't be SP
27586 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
27587 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
27588 Subtarget.is64Bit() ? 8 : 4);
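// Bounds-check the call-site index against the landing-pad list; anything out
// of range branches to the trap block.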
27589 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
27591 .addImm(LPadList.size());
27592 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
27594 if (Subtarget.is64Bit()) {
27595 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27596 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
27598 // leaq .LJTI0_0(%rip), BReg
27599 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
27603 .addJumpTableIndex(MJTI)
27605 // movzx IReg64, IReg
27606 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
27609 .addImm(X86::sub_32bit);
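// Pick the indirect-jump sequence based on the jump table encoding:
// EK_BlockAddress jumps through a table of absolute block addresses, while
// EK_LabelDifference32 adds a sign-extended 32-bit table entry to the table
// base address before jumping.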
27612 case MachineJumpTableInfo::EK_BlockAddress:
27613 // jmpq *(BReg,IReg64,8)
27614 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
27621 case MachineJumpTableInfo::EK_LabelDifference32: {
27622 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
27623 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
27624 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27626 // movl (BReg,IReg64,4), OReg
27627 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
27633 // movsx OReg64, OReg
27634 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
27635 // addq BReg, OReg64, TReg
27636 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
27640 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
27644 llvm_unreachable("Unexpected jump table encoding");
27647 // jmpl *.LJTI0_0(,IReg,4)
27648 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
27652 .addJumpTableIndex(MJTI)
27656 // Add the jump table entries as successors to the MBB.
27657 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
27658 for (auto &LP : LPadList)
27659 if (SeenMBBs.insert(LP).second)
27660 DispContBB->addSuccessor(LP);
27662 // N.B. the order the invoke BBs are processed in doesn't matter here.
27663 SmallVector<MachineBasicBlock *, 64> MBBLPads;
27664 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
27665 for (MachineBasicBlock *MBB : InvokeBBs) {
27666 // Remove the landing pad successor from the invoke block and replace it
27667 // with the new dispatch block.
27668 // Keep a copy of Successors since it's modified inside the loop.
27669 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
27671 // FIXME: Avoid quadratic complexity.
27672 for (auto MBBS : Successors) {
27673 if (MBBS->isEHPad()) {
27674 MBB->removeSuccessor(MBBS);
27675 MBBLPads.push_back(MBBS);
27679 MBB->addSuccessor(DispatchBB);
27681 // Find the invoke call and mark all of the callee-saved registers as
27682 // 'implicit defined' so that they're spilled. This prevents code from
27683 // moving instructions to before the EH block, where they will never be executed.
27685 for (auto &II : reverse(*MBB)) {
27689 DenseMap<unsigned, bool> DefRegs;
27690 for (auto &MOp : II.operands())
27692 DefRegs[MOp.getReg()] = true;
27694 MachineInstrBuilder MIB(*MF, &II);
27695 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
27696 unsigned Reg = SavedRegs[RI];
27698 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
27705 // Mark all former landing pads as non-landing pads. The dispatch is the only
27706 // landing pad now.
27707 for (auto &LP : MBBLPads)
27708 LP->setIsEHPad(false);
27710 // The instruction is gone now.
27711 MI.eraseFromParent();
27715 MachineBasicBlock *
27716 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
27717 MachineBasicBlock *BB) const {
27718 MachineFunction *MF = BB->getParent();
27719 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27720 DebugLoc DL = MI.getDebugLoc();
27722 switch (MI.getOpcode()) {
27723 default: llvm_unreachable("Unexpected instr type to insert");
27724 case X86::TAILJMPd64:
27725 case X86::TAILJMPr64:
27726 case X86::TAILJMPm64:
27727 case X86::TAILJMPr64_REX:
27728 case X86::TAILJMPm64_REX:
27729 llvm_unreachable("TAILJMP64 would not be touched here.");
27730 case X86::TCRETURNdi64:
27731 case X86::TCRETURNri64:
27732 case X86::TCRETURNmi64:
27734 case X86::TLS_addr32:
27735 case X86::TLS_addr64:
27736 case X86::TLS_base_addr32:
27737 case X86::TLS_base_addr64:
27738 return EmitLoweredTLSAddr(MI, BB);
27739 case X86::RETPOLINE_CALL32:
27740 case X86::RETPOLINE_CALL64:
27741 case X86::RETPOLINE_TCRETURN32:
27742 case X86::RETPOLINE_TCRETURN64:
27743 return EmitLoweredRetpoline(MI, BB);
27744 case X86::CATCHRET:
27745 return EmitLoweredCatchRet(MI, BB);
27746 case X86::CATCHPAD:
27747 return EmitLoweredCatchPad(MI, BB);
27748 case X86::SEG_ALLOCA_32:
27749 case X86::SEG_ALLOCA_64:
27750 return EmitLoweredSegAlloca(MI, BB);
27751 case X86::TLSCall_32:
27752 case X86::TLSCall_64:
27753 return EmitLoweredTLSCall(MI, BB);
27754 case X86::CMOV_FR32:
27755 case X86::CMOV_FR64:
27756 case X86::CMOV_FR128:
27757 case X86::CMOV_GR8:
27758 case X86::CMOV_GR16:
27759 case X86::CMOV_GR32:
27760 case X86::CMOV_RFP32:
27761 case X86::CMOV_RFP64:
27762 case X86::CMOV_RFP80:
27763 case X86::CMOV_V2F64:
27764 case X86::CMOV_V2I64:
27765 case X86::CMOV_V4F32:
27766 case X86::CMOV_V4F64:
27767 case X86::CMOV_V4I64:
27768 case X86::CMOV_V16F32:
27769 case X86::CMOV_V8F32:
27770 case X86::CMOV_V8F64:
27771 case X86::CMOV_V8I64:
27772 case X86::CMOV_V8I1:
27773 case X86::CMOV_V16I1:
27774 case X86::CMOV_V32I1:
27775 case X86::CMOV_V64I1:
27776 return EmitLoweredSelect(MI, BB);
27778 case X86::RDFLAGS32:
27779 case X86::RDFLAGS64: {
27781 unsigned PushF = MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
27782 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
27783 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
27784 // Permit reads of the FLAGS register without it being defined.
27785 // This intrinsic exists to read external processor state in flags, such as
27786 // the trap flag, interrupt flag, and direction flag, none of which are
27787 // modeled by the backend.
27788 Push->getOperand(2).setIsUndef();
27789 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
27791 MI.eraseFromParent(); // The pseudo is gone now.
27795 case X86::WRFLAGS32:
27796 case X86::WRFLAGS64: {
27798 unsigned Push = MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
27800 unsigned PopF = MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
27801 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
27802 BuildMI(*BB, MI, DL, TII->get(PopF));
27804 MI.eraseFromParent(); // The pseudo is gone now.
27808 case X86::RELEASE_FADD32mr:
27809 case X86::RELEASE_FADD64mr:
27810 return EmitLoweredAtomicFP(MI, BB);
27812 case X86::FP32_TO_INT16_IN_MEM:
27813 case X86::FP32_TO_INT32_IN_MEM:
27814 case X86::FP32_TO_INT64_IN_MEM:
27815 case X86::FP64_TO_INT16_IN_MEM:
27816 case X86::FP64_TO_INT32_IN_MEM:
27817 case X86::FP64_TO_INT64_IN_MEM:
27818 case X86::FP80_TO_INT16_IN_MEM:
27819 case X86::FP80_TO_INT32_IN_MEM:
27820 case X86::FP80_TO_INT64_IN_MEM: {
27821 // Change the floating point control register to use "round towards zero"
27822 // mode when truncating to an integer value.
27823 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
27824 addFrameReference(BuildMI(*BB, MI, DL,
27825 TII->get(X86::FNSTCW16m)), CWFrameIdx);
27827 // Load the old value of the control word...
27829 unsigned OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
27830 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
27833 // Set the high part to be round to zero...
27834 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
27837 // Reload the modified control word now...
27838 addFrameReference(BuildMI(*BB, MI, DL,
27839 TII->get(X86::FLDCW16m)), CWFrameIdx);
27841 // Restore the memory image of control word to original value
27842 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
27845 unsigned Opc; // Get the X86 opcode to use.
27847 switch (MI.getOpcode()) {
27848 default: llvm_unreachable("illegal opcode!");
27849 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
27850 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
27851 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
27852 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
27853 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
27854 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
27855 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
27856 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
27857 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
27860 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27861 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
27862 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
27864 // Reload the original control word now.
27865 addFrameReference(BuildMI(*BB, MI, DL,
27866 TII->get(X86::FLDCW16m)), CWFrameIdx);
27868 MI.eraseFromParent(); // The pseudo instruction is gone now.
27871 // String/text processing lowering.
27872 case X86::PCMPISTRM128REG:
27873 case X86::VPCMPISTRM128REG:
27874 case X86::PCMPISTRM128MEM:
27875 case X86::VPCMPISTRM128MEM:
27876 case X86::PCMPESTRM128REG:
27877 case X86::VPCMPESTRM128REG:
27878 case X86::PCMPESTRM128MEM:
27879 case X86::VPCMPESTRM128MEM:
27880 assert(Subtarget.hasSSE42() &&
27881 "Target must have SSE4.2 or AVX features enabled");
27882 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
27884 // String/text processing lowering.
27885 case X86::PCMPISTRIREG:
27886 case X86::VPCMPISTRIREG:
27887 case X86::PCMPISTRIMEM:
27888 case X86::VPCMPISTRIMEM:
27889 case X86::PCMPESTRIREG:
27890 case X86::VPCMPESTRIREG:
27891 case X86::PCMPESTRIMEM:
27892 case X86::VPCMPESTRIMEM:
27893 assert(Subtarget.hasSSE42() &&
27894 "Target must have SSE4.2 or AVX features enabled");
27895 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
27897 // Thread synchronization.
27899 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
27900 case X86::MONITORX:
27901 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
27905 return emitClzero(&MI, BB, Subtarget);
27909 return emitWRPKRU(MI, BB, Subtarget);
27911 return emitRDPKRU(MI, BB, Subtarget);
27914 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
27916 case X86::VASTART_SAVE_XMM_REGS:
27917 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
27919 case X86::VAARG_64:
27920 return EmitVAARG64WithCustomInserter(MI, BB);
27922 case X86::EH_SjLj_SetJmp32:
27923 case X86::EH_SjLj_SetJmp64:
27924 return emitEHSjLjSetJmp(MI, BB);
27926 case X86::EH_SjLj_LongJmp32:
27927 case X86::EH_SjLj_LongJmp64:
27928 return emitEHSjLjLongJmp(MI, BB);
27930 case X86::Int_eh_sjlj_setup_dispatch:
27931 return EmitSjLjDispatchBlock(MI, BB);
27933 case TargetOpcode::STATEPOINT:
27934 // As an implementation detail, STATEPOINT shares the STACKMAP format at
27935 // this point in the process. We diverge later.
27936 return emitPatchPoint(MI, BB);
27938 case TargetOpcode::STACKMAP:
27939 case TargetOpcode::PATCHPOINT:
27940 return emitPatchPoint(MI, BB);
27942 case TargetOpcode::PATCHABLE_EVENT_CALL:
27943 // Do nothing here, handle in xray instrumentation pass.
27946 case X86::LCMPXCHG8B: {
27947 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
27948 // In addition to the four E[ABCD] registers implied by the encoding, CMPXCHG8B
27949 // requires a memory operand. If the current architecture is i686 and the
27950 // current function needs a base pointer - which is ESI on i686 - the register
27951 // allocator cannot allocate registers for an address of the form
27952 // X(%reg, %reg, Y): there would never be enough unreserved registers during
27953 // register allocation (without the base pointer the only option would be
27954 // X(%edi, %esi, Y)).
27955 // We help the register allocator by precomputing the address in a new vreg
27956 // using LEA.
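// For example (illustrative operands only), an access such as
//   cmpxchg8b 16(%esi,%edi,4)
// is rewritten so the instruction only needs a single base register:
//   leal 16(%esi,%edi,4), %vreg
//   cmpxchg8b (%vreg)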
27958 // If this is not i686 or there is no base pointer, there is nothing to do here.
27959 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
27962 // Even though this code does not necessarily need the base pointer to
27963 // be ESI, we check for that. The reason: if this assert fails, something
27964 // has changed in the compiler's base pointer handling, and it most
27965 // probably has to be addressed here as well.
27966 assert(TRI->getBaseRegister() == X86::ESI &&
27967 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
27968 "base pointer in mind");
27970 MachineRegisterInfo &MRI = MF->getRegInfo();
27971 MVT SPTy = getPointerTy(MF->getDataLayout());
27972 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27973 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
27975 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27976 // Regalloc does not need any help when the memory operand of CMPXCHG8B
27977 // does not use an index register.
27978 if (AM.IndexReg == X86::NoRegister)
27981 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
27982 // four operand definitions that are E[ABCD] registers. We skip them and
27983 // then insert the LEA.
27984 MachineBasicBlock::iterator MBBI(MI);
27985 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
27986 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
27989 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
27991 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
27995 case X86::LCMPXCHG16B:
27997 case X86::LCMPXCHG8B_SAVE_EBX:
27998 case X86::LCMPXCHG16B_SAVE_RBX: {
28000 unsigned BasePtr = MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
28001 if (!BB->isLiveIn(BasePtr))
28002 BB->addLiveIn(BasePtr);
28008 //===----------------------------------------------------------------------===//
28009 // X86 Optimization Hooks
28010 //===----------------------------------------------------------------------===//
28012 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
28014 KnownBits &Known, const APInt &DemandedElts,
28015 const SelectionDAG &DAG,
28016 unsigned Depth) const {
28017 unsigned BitWidth = Known.getBitWidth();
28018 unsigned Opc = Op.getOpcode();
28019 EVT VT = Op.getValueType();
28020 assert((Opc >= ISD::BUILTIN_OP_END ||
28021 Opc == ISD::INTRINSIC_WO_CHAIN ||
28022 Opc == ISD::INTRINSIC_W_CHAIN ||
28023 Opc == ISD::INTRINSIC_VOID) &&
28024 "Should use MaskedValueIsZero if you don't know whether Op"
28025 " is a target node!");
28030 case X86ISD::SETCC:
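// SETCC produces 0 or 1, so every bit above bit 0 is known zero.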
28031 Known.Zero.setBitsFrom(1);
28033 case X86ISD::MOVMSK: {
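// MOVMSK packs one bit per source vector element into the low bits of the
// result; all higher result bits are known zero.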
28034 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
28035 Known.Zero.setBitsFrom(NumLoBits);
28038 case X86ISD::PEXTRB:
28039 case X86ISD::PEXTRW: {
28040 SDValue Src = Op.getOperand(0);
28041 EVT SrcVT = Src.getValueType();
28042 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
28043 Op.getConstantOperandVal(1));
28044 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
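// PEXTRB/PEXTRW zero-extend the extracted element into the 32-bit result, so
// bits above the source element width are known zero.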
28045 Known = Known.zextOrTrunc(BitWidth);
28046 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
28049 case X86ISD::VSHLI:
28050 case X86ISD::VSRLI: {
28051 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
28052 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
28053 Known.setAllZero();
28057 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
28058 unsigned ShAmt = ShiftImm->getZExtValue();
28059 if (Opc == X86ISD::VSHLI) {
28060 Known.Zero <<= ShAmt;
28061 Known.One <<= ShAmt;
28062 // Low bits are known zero.
28063 Known.Zero.setLowBits(ShAmt);
28065 Known.Zero.lshrInPlace(ShAmt);
28066 Known.One.lshrInPlace(ShAmt);
28067 // High bits are known zero.
28068 Known.Zero.setHighBits(ShAmt);
28073 case X86ISD::VZEXT: {
28074 // TODO: Add DemandedElts support.
28075 SDValue N0 = Op.getOperand(0);
28076 unsigned NumElts = VT.getVectorNumElements();
28078 EVT SrcVT = N0.getValueType();
28079 unsigned InNumElts = SrcVT.getVectorNumElements();
28080 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
28081 assert(InNumElts >= NumElts && "Illegal VZEXT input");
28083 Known = KnownBits(InBitWidth);
28084 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
28085 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
28086 Known = Known.zext(BitWidth);
28087 Known.Zero.setBitsFrom(InBitWidth);
28090 case X86ISD::CMOV: {
28091 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
28092 // If we don't know any bits, early out.
28093 if (Known.isUnknown())
28096 KnownBits Known2; DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
28098 // Only known if known in both the LHS and RHS.
28099 Known.One &= Known2.One;
28100 Known.Zero &= Known2.Zero;
28103 case X86ISD::UDIVREM8_ZEXT_HREG:
28104 // TODO: Support more than just the zero extended bits?
28105 if (Op.getResNo() != 1)
28107 // The remainder is zero extended.
28108 Known.Zero.setBitsFrom(8);
28113 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
28114 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
28115 unsigned Depth) const {
28116 unsigned VTBits = Op.getScalarValueSizeInBits();
28117 unsigned Opcode = Op.getOpcode();
28119 case X86ISD::SETCC_CARRY:
28120 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
28123 case X86ISD::VSEXT: {
28124 // TODO: Add DemandedElts support.
28125 SDValue Src = Op.getOperand(0);
28126 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28127 Tmp += VTBits - Src.getScalarValueSizeInBits();
28131 case X86ISD::VTRUNC: {
28132 // TODO: Add DemandedElts support.
28133 SDValue Src = Op.getOperand(0);
28134 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
28135 assert(VTBits < NumSrcBits && "Illegal truncation input type");
28136 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28137 if (Tmp > (NumSrcBits - VTBits))
28138 return Tmp - (NumSrcBits - VTBits);
28142 case X86ISD::PACKSS: {
28143 // PACKSS is just a truncation if the sign bits extend to the packed size.
28144 // TODO: Add DemandedElts support.
28145 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
28146 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
28147 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
28148 unsigned Tmp = std::min(Tmp0, Tmp1);
28149 if (Tmp > (SrcBits - VTBits))
28150 return Tmp - (SrcBits - VTBits);
28154 case X86ISD::VSHLI: {
28155 SDValue Src = Op.getOperand(0);
28156 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28157 if (ShiftVal.uge(VTBits))
28158 return VTBits; // Shifted all bits out --> zero.
28159 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28160 if (ShiftVal.uge(Tmp))
28161 return 1; // Shifted all sign bits out --> unknown.
28162 return Tmp - ShiftVal.getZExtValue();
28165 case X86ISD::VSRAI: {
28166 SDValue Src = Op.getOperand(0);
28167 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28168 if (ShiftVal.uge(VTBits - 1))
28169 return VTBits; // Sign splat.
28170 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28172 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
28175 case X86ISD::PCMPGT:
28176 case X86ISD::PCMPEQ:
28178 case X86ISD::VPCOM:
28179 case X86ISD::VPCOMU:
28180 // Vector compares return zero/all-bits result values.
28183 case X86ISD::CMOV: {
28184 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
28185 if (Tmp0 == 1) return 1; // Early out.
28186 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
28187 return std::min(Tmp0, Tmp1);
28189 case X86ISD::SDIVREM8_SEXT_HREG:
28190 // TODO: Support more than just the sign extended bits?
28191 if (Op.getResNo() != 1)
28193 // The remainder is sign extended.
28201 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
28202 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
28203 return N->getOperand(0);
28207 /// Returns true (and the GlobalValue and the offset) if the node is a
28208 /// GlobalAddress + offset.
28209 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
28210 const GlobalValue* &GA,
28211 int64_t &Offset) const {
28212 if (N->getOpcode() == X86ISD::Wrapper) {
28213 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
28214 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
28215 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
28219 return TargetLowering::isGAPlusOffset(N, GA, Offset);
28222 // Attempt to match a combined shuffle mask against supported unary shuffle instructions.
28224 // TODO: Investigate sharing more of this with shuffle lowering.
28225 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28226 bool AllowFloatDomain, bool AllowIntDomain,
28227 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
28228 const X86Subtarget &Subtarget,
28229 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
28230 unsigned NumMaskElts = Mask.size();
28231 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
28233 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
28234 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
28235 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
28236 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
28237 unsigned MaxScale = 64 / MaskEltSize;
28238 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
28240 unsigned NumDstElts = NumMaskElts / Scale;
28241 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
28242 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
28243 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
28246 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
28247 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
28248 MVT::getIntegerVT(MaskEltSize);
28249 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
28251 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
28252 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
28253 Shuffle = unsigned(X86ISD::VZEXT);
28255 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
28257 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
28258 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
28264 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
28265 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
28266 isUndefOrEqual(Mask[0], 0) &&
28267 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
28268 Shuffle = X86ISD::VZEXT_MOVL;
28269 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
28273 // Check if we have SSE3, which will let us use MOVDDUP etc. These
28274 // instructions are no slower than UNPCKLPD, but they have the option to
28275 // fold the input operand from memory, even from an unaligned load.
28276 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
28277 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
28278 Shuffle = X86ISD::MOVDDUP;
28279 SrcVT = DstVT = MVT::v2f64;
28282 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
28283 Shuffle = X86ISD::MOVSLDUP;
28284 SrcVT = DstVT = MVT::v4f32;
28287 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
28288 Shuffle = X86ISD::MOVSHDUP;
28289 SrcVT = DstVT = MVT::v4f32;
28294 if (MaskVT.is256BitVector() && AllowFloatDomain) {
28295 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
28296 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
28297 Shuffle = X86ISD::MOVDDUP;
28298 SrcVT = DstVT = MVT::v4f64;
28301 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
28302 Shuffle = X86ISD::MOVSLDUP;
28303 SrcVT = DstVT = MVT::v8f32;
28306 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
28307 Shuffle = X86ISD::MOVSHDUP;
28308 SrcVT = DstVT = MVT::v8f32;
28313 if (MaskVT.is512BitVector() && AllowFloatDomain) {
28314 assert(Subtarget.hasAVX512() &&
28315 "AVX512 required for 512-bit vector shuffles");
28316 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
28317 Shuffle = X86ISD::MOVDDUP;
28318 SrcVT = DstVT = MVT::v8f64;
28321 if (isTargetShuffleEquivalent(
28322 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
28323 Shuffle = X86ISD::MOVSLDUP;
28324 SrcVT = DstVT = MVT::v16f32;
28327 if (isTargetShuffleEquivalent(
28328 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
28329 Shuffle = X86ISD::MOVSHDUP;
28330 SrcVT = DstVT = MVT::v16f32;
28335 // Attempt to match against broadcast-from-vector.
28336 if (Subtarget.hasAVX2()) {
28337 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
28338 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
28339 SrcVT = DstVT = MaskVT;
28340 Shuffle = X86ISD::VBROADCAST;
28348 // Attempt to match a combined shuffle mask against supported unary immediate
28349 // permute instructions.
28350 // TODO: Investigate sharing more of this with shuffle lowering.
28351 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28352 const APInt &Zeroable,
28353 bool AllowFloatDomain,
28354 bool AllowIntDomain,
28355 const X86Subtarget &Subtarget,
28356 unsigned &Shuffle, MVT &ShuffleVT,
28357 unsigned &PermuteImm) {
28358 unsigned NumMaskElts = Mask.size();
28359 unsigned InputSizeInBits = MaskVT.getSizeInBits();
28360 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
28361 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
28363 bool ContainsZeros =
28364 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28366 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
28367 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
28368 // Check for lane crossing permutes.
28369 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
28370 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
28371 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
28372 Shuffle = X86ISD::VPERMI;
28373 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
28374 PermuteImm = getV4X86ShuffleImm(Mask);
28377 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
28378 SmallVector<int, 4> RepeatedMask;
28379 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
28380 Shuffle = X86ISD::VPERMI;
28381 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
28382 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
28386 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
28387 // VPERMILPD can permute with a non-repeating shuffle.
28388 Shuffle = X86ISD::VPERMILPI;
28389 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
28391 for (int i = 0, e = Mask.size(); i != e; ++i) {
28393 if (M == SM_SentinelUndef)
28395 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
28396 PermuteImm |= (M & 1) << i;
28402 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
28403 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
28404 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
28405 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
28406 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
28407 SmallVector<int, 4> RepeatedMask;
28408 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28409 // Narrow the repeated mask to create 32-bit element permutes.
28410 SmallVector<int, 4> WordMask = RepeatedMask;
28411 if (MaskScalarSizeInBits == 64)
28412 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
28414 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
28415 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
28416 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
28417 PermuteImm = getV4X86ShuffleImm(WordMask);
28422 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
28423 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
28424 SmallVector<int, 4> RepeatedMask;
28425 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28426 ArrayRef<int> LoMask(Mask.data() + 0, 4);
28427 ArrayRef<int> HiMask(Mask.data() + 4, 4);
28429 // PSHUFLW: permute lower 4 elements only.
28430 if (isUndefOrInRange(LoMask, 0, 4) &&
28431 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
28432 Shuffle = X86ISD::PSHUFLW;
28433 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28434 PermuteImm = getV4X86ShuffleImm(LoMask);
28438 // PSHUFHW: permute upper 4 elements only.
28439 if (isUndefOrInRange(HiMask, 4, 8) &&
28440 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
28441 // Offset the HiMask so that we can create the shuffle immediate.
28442 int OffsetHiMask[4];
28443 for (int i = 0; i != 4; ++i)
28444 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
28446 Shuffle = X86ISD::PSHUFHW;
28447 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28448 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
28454 // Attempt to match against byte/bit shifts.
28455 // FIXME: Add 512-bit support.
28456 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28457 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28458 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
28459 MaskScalarSizeInBits, Mask,
28460 0, Zeroable, Subtarget);
28461 if (0 < ShiftAmt) {
28462 PermuteImm = (unsigned)ShiftAmt;
28470 // Attempt to match a combined unary shuffle mask against supported binary
28471 // shuffle instructions.
28472 // TODO: Investigate sharing more of this with shuffle lowering.
28473 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28474 bool AllowFloatDomain, bool AllowIntDomain,
28475 SDValue &V1, SDValue &V2, SDLoc &DL,
28477 const X86Subtarget &Subtarget,
28478 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
28480 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28482 if (MaskVT.is128BitVector()) {
28483 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
28485 Shuffle = X86ISD::MOVLHPS;
28486 SrcVT = DstVT = MVT::v4f32;
28489 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
28491 Shuffle = X86ISD::MOVHLPS;
28492 SrcVT = DstVT = MVT::v4f32;
28495 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
28496 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28498 Shuffle = X86ISD::MOVSD;
28499 SrcVT = DstVT = MaskVT;
28502 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
28503 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28504 Shuffle = X86ISD::MOVSS;
28505 SrcVT = DstVT = MaskVT;
28510 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
28511 // TODO add support for 256/512-bit types.
28512 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
28513 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
28520 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
28521 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
28522 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28523 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
28524 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
28525 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
28526 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
28528 SrcVT = DstVT = MaskVT;
28529 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
28530 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
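// Attempt to match a combined shuffle mask against supported binary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.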
28538 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28539 const APInt &Zeroable,
28540 bool AllowFloatDomain,
28541 bool AllowIntDomain,
28542 SDValue &V1, SDValue &V2, SDLoc &DL,
28544 const X86Subtarget &Subtarget,
28545 unsigned &Shuffle, MVT &ShuffleVT,
28546 unsigned &PermuteImm) {
28547 unsigned NumMaskElts = Mask.size();
28548 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28550 // Attempt to match against PALIGNR byte rotate.
28551 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28552 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28553 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
28554 if (0 < ByteRotation) {
28555 Shuffle = X86ISD::PALIGNR;
28556 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
28557 PermuteImm = ByteRotation;
28562 // Attempt to combine to X86ISD::BLENDI.
28563 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
28564 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
28565 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
28566 uint64_t BlendMask = 0;
28567 bool ForceV1Zero = false, ForceV2Zero = false;
28568 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
28569 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
28571 if (MaskVT == MVT::v16i16) {
28572 // We can only use v16i16 PBLENDW if the lanes are repeated.
28573 SmallVector<int, 8> RepeatedMask;
28574 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
28576 assert(RepeatedMask.size() == 8 &&
28577 "Repeated mask size doesn't match!");
28579 for (int i = 0; i < 8; ++i)
28580 if (RepeatedMask[i] >= 8)
28581 PermuteImm |= 1 << i;
28582 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28583 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28584 Shuffle = X86ISD::BLENDI;
28585 ShuffleVT = MaskVT;
28589 // Determine a type compatible with X86ISD::BLENDI.
28590 ShuffleVT = MaskVT;
28591 if (Subtarget.hasAVX2()) {
28592 if (ShuffleVT == MVT::v4i64)
28593 ShuffleVT = MVT::v8i32;
28594 else if (ShuffleVT == MVT::v2i64)
28595 ShuffleVT = MVT::v4i32;
28597 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
28598 ShuffleVT = MVT::v8i16;
28599 else if (ShuffleVT == MVT::v4i64)
28600 ShuffleVT = MVT::v4f64;
28601 else if (ShuffleVT == MVT::v8i32)
28602 ShuffleVT = MVT::v8f32;
28605 if (!ShuffleVT.isFloatingPoint()) {
28606 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
28608 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
28609 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
28610 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
28613 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28614 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28615 PermuteImm = (unsigned)BlendMask;
28616 Shuffle = X86ISD::BLENDI;
28622 // Attempt to combine to INSERTPS.
28623 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
28624 MaskVT.is128BitVector()) {
28625 if (Zeroable.getBoolValue() &&
28626 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
28627 Shuffle = X86ISD::INSERTPS;
28628 ShuffleVT = MVT::v4f32;
28633 // Attempt to combine to SHUFPD.
28634 if (AllowFloatDomain && EltSizeInBits == 64 &&
28635 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28636 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28637 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28638 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
28639 Shuffle = X86ISD::SHUFP;
28640 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
28645 // Attempt to combine to SHUFPS.
28646 if (AllowFloatDomain && EltSizeInBits == 32 &&
28647 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
28648 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28649 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28650 SmallVector<int, 4> RepeatedMask;
28651 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
28652 // Match each half of the repeated mask to determine whether it just
28653 // references one of the vectors, is zeroable, or is entirely undef.
28654 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
28655 int M0 = RepeatedMask[Offset];
28656 int M1 = RepeatedMask[Offset + 1];
28658 if (isUndefInRange(RepeatedMask, Offset, 2)) {
28659 return DAG.getUNDEF(MaskVT);
28660 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
28661 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
28662 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
28663 return getZeroVector(MaskVT, Subtarget, DAG, DL);
28664 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
28665 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28666 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
28668 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
28669 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28670 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
28677 int ShufMask[4] = {-1, -1, -1, -1};
28678 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
28679 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
28684 Shuffle = X86ISD::SHUFP;
28685 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
28686 PermuteImm = getV4X86ShuffleImm(ShufMask);
28695 /// \brief Combine an arbitrary chain of shuffles into a single instruction if possible.
28698 /// This is the leaf of the recursive combine below. When we have found some
28699 /// chain of single-use x86 shuffle instructions and accumulated the combined
28700 /// shuffle mask represented by them, this will try to pattern match that mask
28701 /// into either a single instruction if there is a special purpose instruction
28702 /// for this operation, or into a PSHUFB instruction which is a fully general
28703 /// instruction but should only be used to replace chains over a certain depth.
28704 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
28705 ArrayRef<int> BaseMask, int Depth,
28706 bool HasVariableMask, SelectionDAG &DAG,
28707 TargetLowering::DAGCombinerInfo &DCI,
28708 const X86Subtarget &Subtarget) {
28709 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
28710 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
28711 "Unexpected number of shuffle inputs!");
28713 // Find the inputs that enter the chain. Note that multiple uses are OK
28714 // here; we're not going to remove the operands we find.
28715 bool UnaryShuffle = (Inputs.size() == 1);
28716 SDValue V1 = peekThroughBitcasts(Inputs[0]);
28717 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
28718 : peekThroughBitcasts(Inputs[1]));
28720 MVT VT1 = V1.getSimpleValueType();
28721 MVT VT2 = V2.getSimpleValueType();
28722 MVT RootVT = Root.getSimpleValueType();
28723 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
28724 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
28725 "Vector size mismatch");
28730 unsigned NumBaseMaskElts = BaseMask.size();
28731 if (NumBaseMaskElts == 1) {
28732 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
28733 return DAG.getBitcast(RootVT, V1);
28736 unsigned RootSizeInBits = RootVT.getSizeInBits();
28737 unsigned NumRootElts = RootVT.getVectorNumElements();
28738 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
28739 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
28740 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
28742 // Don't combine if we are an AVX512/EVEX target and the mask element size
28743 // is different from the root element size - this would prevent writemasks
28744 // from being reused.
28745 // TODO - this currently prevents all lane shuffles from occurring.
28746 // TODO - check for writemasks usage instead of always preventing combining.
28747 // TODO - attempt to narrow Mask back to writemask size.
28748 bool IsEVEXShuffle =
28749 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
28751 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
28753 // Handle 128-bit lane shuffles of 256-bit vectors.
28754 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
28755 // we need to use the zeroing feature.
28756 // TODO - this should support binary shuffles.
28757 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
28758 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
28759 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
28760 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
28761 return SDValue(); // Nothing to do!
28762 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
28763 unsigned PermMask = 0;
28764 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
28765 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
28767 Res = DAG.getBitcast(ShuffleVT, V1);
28768 DCI.AddToWorklist(Res.getNode());
28769 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
28770 DAG.getUNDEF(ShuffleVT),
28771 DAG.getConstant(PermMask, DL, MVT::i8));
28772 DCI.AddToWorklist(Res.getNode());
28773 return DAG.getBitcast(RootVT, Res);
28776 // For masks that have been widened to 128-bit elements or more,
28777 // narrow back down to 64-bit elements.
28778 SmallVector<int, 64> Mask;
28779 if (BaseMaskEltSizeInBits > 64) {
28780 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
28781 int MaskScale = BaseMaskEltSizeInBits / 64;
28782 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
28784 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
28787 unsigned NumMaskElts = Mask.size();
28788 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
28790 // Determine the effective mask value type.
28791 FloatDomain &= (32 <= MaskEltSizeInBits);
28792 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
28793 : MVT::getIntegerVT(MaskEltSizeInBits);
28794 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
28796 // Only allow legal mask types.
28797 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
28800 // Attempt to match the mask against known shuffle patterns.
28801 MVT ShuffleSrcVT, ShuffleVT;
28802 unsigned Shuffle, PermuteImm;
28804 // Which shuffle domains are permitted?
28805 // Permit domain crossing at higher combine depths.
28806 bool AllowFloatDomain = FloatDomain || (Depth > 3);
28807 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
28808 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
28810 // Determine zeroable mask elements.
28811 APInt Zeroable(NumMaskElts, 0);
28812 for (unsigned i = 0; i != NumMaskElts; ++i)
28813 if (isUndefOrZero(Mask[i]))
28814 Zeroable.setBit(i);
28816 if (UnaryShuffle) {
28817 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
28818 // directly if we don't shuffle the lower element and we shuffle the upper
28819 // (zero) elements within themselves.
28820 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
28821 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
28822 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
28823 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
28824 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
28825 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
28826 return DAG.getBitcast(RootVT, V1);
28830 SDValue NewV1 = V1; // Save operand in case early exit happens.
28831 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28832 NewV1, DL, DAG, Subtarget, Shuffle,
28833 ShuffleSrcVT, ShuffleVT) &&
28834 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28835 if (Depth == 1 && Root.getOpcode() == Shuffle)
28836 return SDValue(); // Nothing to do!
28837 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
28838 DCI.AddToWorklist(Res.getNode());
28839 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
28840 DCI.AddToWorklist(Res.getNode());
28841 return DAG.getBitcast(RootVT, Res);
28844 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
28845 AllowIntDomain, Subtarget, Shuffle,
28846 ShuffleVT, PermuteImm) &&
28847 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28848 if (Depth == 1 && Root.getOpcode() == Shuffle)
28849 return SDValue(); // Nothing to do!
28850 Res = DAG.getBitcast(ShuffleVT, V1);
28851 DCI.AddToWorklist(Res.getNode());
28852 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
28853 DAG.getConstant(PermuteImm, DL, MVT::i8));
28854 DCI.AddToWorklist(Res.getNode());
28855 return DAG.getBitcast(RootVT, Res);
28859 SDValue NewV1 = V1; // Save operands in case early exit happens.
28860 SDValue NewV2 = V2;
28861 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28862 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
28863 ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
28864 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28865 if (Depth == 1 && Root.getOpcode() == Shuffle)
28866 return SDValue(); // Nothing to do!
28867 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
28868 DCI.AddToWorklist(NewV1.getNode());
28869 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
28870 DCI.AddToWorklist(NewV2.getNode());
28871 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
28872 DCI.AddToWorklist(Res.getNode());
28873 return DAG.getBitcast(RootVT, Res);
28876 NewV1 = V1; // Save operands in case early exit happens.
28878 if (matchBinaryPermuteVectorShuffle(
28879 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
28880 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
28881 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28882 if (Depth == 1 && Root.getOpcode() == Shuffle)
28883 return SDValue(); // Nothing to do!
28884 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
28885 DCI.AddToWorklist(NewV1.getNode());
28886 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
28887 DCI.AddToWorklist(NewV2.getNode());
28888 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
28889 DAG.getConstant(PermuteImm, DL, MVT::i8));
28890 DCI.AddToWorklist(Res.getNode());
28891 return DAG.getBitcast(RootVT, Res);
28894 // Typically from here on, we need an integer version of MaskVT.
28895 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
28896 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
28898 // Annoyingly, SSE4A instructions don't map into the above match helpers.
28899 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
28900 uint64_t BitLen, BitIdx;
28901 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
28903 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
28904 return SDValue(); // Nothing to do!
28905 V1 = DAG.getBitcast(IntMaskVT, V1);
28906 DCI.AddToWorklist(V1.getNode());
28907 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
28908 DAG.getConstant(BitLen, DL, MVT::i8),
28909 DAG.getConstant(BitIdx, DL, MVT::i8));
28910 DCI.AddToWorklist(Res.getNode());
28911 return DAG.getBitcast(RootVT, Res);
28914 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
28915 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
28916 return SDValue(); // Nothing to do!
28917 V1 = DAG.getBitcast(IntMaskVT, V1);
28918 DCI.AddToWorklist(V1.getNode());
28919 V2 = DAG.getBitcast(IntMaskVT, V2);
28920 DCI.AddToWorklist(V2.getNode());
28921 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
28922 DAG.getConstant(BitLen, DL, MVT::i8),
28923 DAG.getConstant(BitIdx, DL, MVT::i8));
28924 DCI.AddToWorklist(Res.getNode());
28925 return DAG.getBitcast(RootVT, Res);
28929 // Don't try to re-form single instruction chains under any circumstances now
28930 // that we've done encoding canonicalization for them.
28934 // Depth threshold above which we can efficiently use variable mask shuffles.
28935 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
28936 bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
28938 bool MaskContainsZeros =
28939 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28941 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
28942 // If we have a single input lane-crossing shuffle then lower to VPERMV.
28943 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
28944 ((Subtarget.hasAVX2() &&
28945 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28946 (Subtarget.hasAVX512() &&
28947 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28948 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28949 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28950 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28951 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28952 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28953 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28954 DCI.AddToWorklist(VPermMask.getNode());
28955 Res = DAG.getBitcast(MaskVT, V1);
28956 DCI.AddToWorklist(Res.getNode());
28957 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
28958 DCI.AddToWorklist(Res.getNode());
28959 return DAG.getBitcast(RootVT, Res);
28962 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
28963 // vector as the second source.
28964 if (UnaryShuffle && AllowVariableMask &&
28965 ((Subtarget.hasAVX512() &&
28966 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28967 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28968 (Subtarget.hasVLX() &&
28969 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28970 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28971 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28972 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28973 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28974 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28975 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
28976 for (unsigned i = 0; i != NumMaskElts; ++i)
28977 if (Mask[i] == SM_SentinelZero)
28978 Mask[i] = NumMaskElts + i;
28980 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28981 DCI.AddToWorklist(VPermMask.getNode());
28982 Res = DAG.getBitcast(MaskVT, V1);
28983 DCI.AddToWorklist(Res.getNode());
28984 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
28985 DCI.AddToWorklist(Zero.getNode());
28986 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
28987 DCI.AddToWorklist(Res.getNode());
28988 return DAG.getBitcast(RootVT, Res);
28991 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
28992 if (AllowVariableMask && !MaskContainsZeros &&
28993 ((Subtarget.hasAVX512() &&
28994 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28995 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28996 (Subtarget.hasVLX() &&
28997 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28998 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28999 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29000 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29001 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29002 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29003 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29004 DCI.AddToWorklist(VPermMask.getNode());
29005 V1 = DAG.getBitcast(MaskVT, V1);
29006 DCI.AddToWorklist(V1.getNode());
29007 V2 = DAG.getBitcast(MaskVT, V2);
29008 DCI.AddToWorklist(V2.getNode());
29009 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
29010 DCI.AddToWorklist(Res.getNode());
29011 return DAG.getBitcast(RootVT, Res);
29016 // See if we can combine a single input shuffle with zeros to a bit-mask,
29017 // which is much simpler than any shuffle.
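// As a rough illustration (not from the original comments): a v4i32 shuffle
// mask <0, SM_SentinelZero, 2, SM_SentinelZero> keeps elements 0 and 2 of V1
// and zeroes the rest, so it can be emitted as
//   and v4i32 V1, <0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0>
// instead of an actual shuffle.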
29018 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
29019 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
29020 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
29021 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
29022 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
29023 APInt UndefElts(NumMaskElts, 0);
29024 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
29025 for (unsigned i = 0; i != NumMaskElts; ++i) {
29026 int M = Mask[i];
29027 if (M == SM_SentinelUndef) {
29028 UndefElts.setBit(i);
29029 continue;
29030 }
29031 if (M == SM_SentinelZero)
29032 continue;
29033 EltBits[i] = AllOnes;
29034 }
29035 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
29036 DCI.AddToWorklist(BitMask.getNode());
29037 Res = DAG.getBitcast(MaskVT, V1);
29038 DCI.AddToWorklist(Res.getNode());
29039 unsigned AndOpcode =
29040 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
29041 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
29042 DCI.AddToWorklist(Res.getNode());
29043 return DAG.getBitcast(RootVT, Res);
29046 // If we have a single input shuffle with different shuffle patterns in the
29047 // 128-bit lanes, use the variable mask form (VPERMILPV).
29048 // TODO: Combine other mask types at higher depths.
29049 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29050 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
29051 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
29052 SmallVector<SDValue, 16> VPermIdx;
29053 for (int M : Mask) {
29054 SDValue Idx =
29055 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
29056 VPermIdx.push_back(Idx);
29057 }
29058 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
29059 DCI.AddToWorklist(VPermMask.getNode());
29060 Res = DAG.getBitcast(MaskVT, V1);
29061 DCI.AddToWorklist(Res.getNode());
29062 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
29063 DCI.AddToWorklist(Res.getNode());
29064 return DAG.getBitcast(RootVT, Res);
29067 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
29068 // to VPERMIL2PD/VPERMIL2PS.
29069 if (AllowVariableMask && Subtarget.hasXOP() &&
29070 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
29071 MaskVT == MVT::v8f32)) {
29072 // VPERMIL2 Operation.
29073 // Bits[3] - Match Bit.
29074 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
29075 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
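// Illustrative example of the encoding used below (not from the original
// comments): for v4f32, NumEltsPerLane == 4, so mask element M == 5 (element 1
// of the second source) encodes to (5 % 4) + (5 / 4) * 4 == 5, i.e. per-lane
// selector 1 with the source-select bit set, while SM_SentinelZero encodes to
// 8, which sets the match/zero bit (Bits[3]).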
29076 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
29077 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
29078 SmallVector<int, 8> VPerm2Idx;
29079 unsigned M2ZImm = 0;
29080 for (int M : Mask) {
29081 if (M == SM_SentinelUndef) {
29082 VPerm2Idx.push_back(-1);
29083 continue;
29084 }
29085 if (M == SM_SentinelZero) {
29086 M2ZImm = 2;
29087 VPerm2Idx.push_back(8);
29088 continue;
29089 }
29090 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
29091 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
29092 VPerm2Idx.push_back(Index);
29093 }
29094 V1 = DAG.getBitcast(MaskVT, V1);
29095 DCI.AddToWorklist(V1.getNode());
29096 V2 = DAG.getBitcast(MaskVT, V2);
29097 DCI.AddToWorklist(V2.getNode());
29098 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
29099 DCI.AddToWorklist(VPerm2MaskOp.getNode());
29100 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
29101 DAG.getConstant(M2ZImm, DL, MVT::i8));
29102 DCI.AddToWorklist(Res.getNode());
29103 return DAG.getBitcast(RootVT, Res);
29106 // If we have 3 or more shuffle instructions or a chain involving a variable
29107 // mask, we can replace them with a single PSHUFB instruction profitably.
29108 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
29109 // instructions, but in practice PSHUFB tends to be *very* fast so we're
29110 // more aggressive.
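// For example (roughly): lowering a v4i32 mask <1, SM_SentinelZero, 0, 3> as a
// v16i8 PSHUFB uses Ratio == 4, giving the byte mask
//   <4,5,6,7, 255,255,255,255, 0,1,2,3, 12,13,14,15>
// where byte value 255 (bit 7 set) produces a zero output byte.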
29111 if (UnaryShuffle && AllowVariableMask &&
29112 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29113 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
29114 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
29115 SmallVector<SDValue, 16> PSHUFBMask;
29116 int NumBytes = RootVT.getSizeInBits() / 8;
29117 int Ratio = NumBytes / NumMaskElts;
29118 for (int i = 0; i < NumBytes; ++i) {
29119 int M = Mask[i / Ratio];
29120 if (M == SM_SentinelUndef) {
29121 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
29122 continue;
29123 }
29124 if (M == SM_SentinelZero) {
29125 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
29126 continue;
29127 }
29128 M = Ratio * M + i % Ratio;
29129 assert((M / 16) == (i / 16) && "Lane crossing detected");
29130 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29131 }
29132 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
29133 Res = DAG.getBitcast(ByteVT, V1);
29134 DCI.AddToWorklist(Res.getNode());
29135 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
29136 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
29137 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
29138 DCI.AddToWorklist(Res.getNode());
29139 return DAG.getBitcast(RootVT, Res);
29142 // With XOP, if we have a 128-bit binary input shuffle we can always combine
29143 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
29144 // slower than PSHUFB on targets that support both.
29145 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
29146 // VPPERM Mask Operation
29147 // Bits[4:0] - Byte Index (0 - 31)
29148 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
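// For example (roughly): with two v4i32 inputs, Ratio == 4, so mask element
// M == 5 (element 1 of V2) expands to the byte indices <20,21,22,23>, while
// SM_SentinelZero expands to four bytes of 128 (permute operation 4 == ZERO).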
29149 SmallVector<SDValue, 16> VPPERMMask;
29150 int NumBytes = RootVT.getSizeInBits() / 8;
29151 int Ratio = NumBytes / NumMaskElts;
29152 for (int i = 0; i < NumBytes; ++i) {
29153 int M = Mask[i / Ratio];
29154 if (M == SM_SentinelUndef) {
29155 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
29156 continue;
29157 }
29158 if (M == SM_SentinelZero) {
29159 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
29160 continue;
29161 }
29162 M = Ratio * M + i % Ratio;
29163 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29164 }
29165 MVT ByteVT = MVT::v16i8;
29166 V1 = DAG.getBitcast(ByteVT, V1);
29167 DCI.AddToWorklist(V1.getNode());
29168 V2 = DAG.getBitcast(ByteVT, V2);
29169 DCI.AddToWorklist(V2.getNode());
29170 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
29171 DCI.AddToWorklist(VPPERMMaskOp.getNode());
29172 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
29173 DCI.AddToWorklist(Res.getNode());
29174 return DAG.getBitcast(RootVT, Res);
29177 // Failed to find any combines.
29178 return SDValue();
29179 }
29181 // Attempt to constant fold all of the constant source ops.
29182 // Returns true if the entire shuffle is folded to a constant.
29183 // TODO: Extend this to merge multiple constant Ops and update the mask.
29184 static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
29185 ArrayRef<int> Mask, SDValue Root,
29186 bool HasVariableMask,
29187 SelectionDAG &DAG,
29188 TargetLowering::DAGCombinerInfo &DCI,
29189 const X86Subtarget &Subtarget) {
29190 MVT VT = Root.getSimpleValueType();
29192 unsigned SizeInBits = VT.getSizeInBits();
29193 unsigned NumMaskElts = Mask.size();
29194 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
29195 unsigned NumOps = Ops.size();
29197 // Extract constant bits from each source op.
29198 bool OneUseConstantOp = false;
29199 SmallVector<APInt, 16> UndefEltsOps(NumOps);
29200 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
29201 for (unsigned i = 0; i != NumOps; ++i) {
29202 SDValue SrcOp = Ops[i];
29203 OneUseConstantOp |= SrcOp.hasOneUse();
29204 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
29205 RawBitsOps[i]))
29206 return SDValue();
29207 }
29209 // Only fold if at least one of the constants has a single use or the
29210 // combined shuffle already includes a variable mask shuffle; this avoids
29211 // constant pool bloat.
29212 if (!OneUseConstantOp && !HasVariableMask)
29213 return SDValue();
29215 // Shuffle the constant bits according to the mask.
29216 APInt UndefElts(NumMaskElts, 0);
29217 APInt ZeroElts(NumMaskElts, 0);
29218 APInt ConstantElts(NumMaskElts, 0);
29219 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
29220 APInt::getNullValue(MaskSizeInBits));
29221 for (unsigned i = 0; i != NumMaskElts; ++i) {
29222 int M = Mask[i];
29223 if (M == SM_SentinelUndef) {
29224 UndefElts.setBit(i);
29225 continue;
29226 } else if (M == SM_SentinelZero) {
29227 ZeroElts.setBit(i);
29228 continue;
29229 }
29230 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
29232 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
29233 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
29235 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
29236 if (SrcUndefElts[SrcMaskIdx]) {
29237 UndefElts.setBit(i);
29238 continue;
29239 }
29241 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
29242 APInt &Bits = SrcEltBits[SrcMaskIdx];
29243 if (Bits == 0) {
29244 ZeroElts.setBit(i);
29245 continue;
29246 }
29248 ConstantElts.setBit(i);
29249 ConstantBitData[i] = Bits;
29250 }
29251 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
29253 // Create the constant data.
29254 MVT MaskSVT;
29255 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
29256 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
29257 else
29258 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
29260 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
29262 SDLoc DL(Root);
29263 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
29264 DCI.AddToWorklist(CstOp.getNode());
29265 return DAG.getBitcast(VT, CstOp);
29266 }
29268 /// \brief Fully generic combining of x86 shuffle instructions.
29270 /// This should be the last combine run over the x86 shuffle instructions. Once
29271 /// they have been fully optimized, this will recursively consider all chains
29272 /// of single-use shuffle instructions, build a generic model of the cumulative
29273 /// shuffle operation, and check for simpler instructions which implement this
29274 /// operation. We use this primarily for two purposes:
29276 /// 1) Collapse generic shuffles to specialized single instructions when
29277 /// equivalent. In most cases, this is just an encoding size win, but
29278 /// sometimes we will collapse multiple generic shuffles into a single
29279 /// special-purpose shuffle.
29280 /// 2) Look for sequences of shuffle instructions with 3 or more total
29281 /// instructions, and replace them with the slightly more expensive SSSE3
29282 /// PSHUFB instruction if available. We do this as the last combining step
29283 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
29284 /// a suitable short sequence of other instructions. The PSHUFB will either
29285 /// use a register or have to read from memory and so is slightly (but only
29286 /// slightly) more expensive than the other shuffle instructions.
29288 /// Because this is inherently a quadratic operation (for each shuffle in
29289 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
29290 /// This should never be an issue in practice as the shuffle lowering doesn't
29291 /// produce sequences of more than 8 instructions.
29293 /// FIXME: We will currently miss some cases where the redundant shuffling
29294 /// would simplify under the threshold for PSHUFB formation because of
29295 /// combine-ordering. To fix this, we should do the redundant instruction
29296 /// combining in this recursive walk.
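///
/// As a small illustrative example (not part of the original documentation):
/// if the root is a v4i32 shuffle with mask <2,3,0,1> whose operand is itself
/// a shuffle with mask <1,0,3,2>, the merged mask applied to the inner source
/// is <3,2,1,0>, which the chain combine can then try to match as a single
/// instruction.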
29297 static SDValue combineX86ShufflesRecursively(
29298 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
29299 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
29300 bool HasVariableMask, SelectionDAG &DAG,
29301 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
29302 // Bound the depth of our recursive combine because this is ultimately
29303 // quadratic in nature.
29304 if (Depth > 8)
29305 return SDValue();
29307 // Directly rip through bitcasts to find the underlying operand.
29308 SDValue Op = SrcOps[SrcOpIndex];
29309 Op = peekThroughOneUseBitcasts(Op);
29311 MVT VT = Op.getSimpleValueType();
29312 if (!VT.isVector())
29313 return SDValue(); // Bail if we hit a non-vector.
29315 assert(Root.getSimpleValueType().isVector() &&
29316 "Shuffles operate on vector types!");
29317 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
29318 "Can only combine shuffles of the same vector register size.");
29320 // Extract target shuffle mask and resolve sentinels and inputs.
29321 SmallVector<int, 64> OpMask;
29322 SmallVector<SDValue, 2> OpInputs;
29323 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
29324 return SDValue();
29326 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
29327 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
29328 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
29330 // Add the inputs to the Ops list, avoiding duplicates.
29331 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
29333 int InputIdx0 = -1, InputIdx1 = -1;
29334 for (int i = 0, e = Ops.size(); i < e; ++i) {
29335 SDValue BC = peekThroughBitcasts(Ops[i]);
29336 if (Input0 && BC == peekThroughBitcasts(Input0))
29337 InputIdx0 = i;
29338 if (Input1 && BC == peekThroughBitcasts(Input1))
29339 InputIdx1 = i;
29340 }
29342 if (Input0 && InputIdx0 < 0) {
29343 InputIdx0 = SrcOpIndex;
29344 Ops[SrcOpIndex] = Input0;
29345 }
29346 if (Input1 && InputIdx1 < 0) {
29347 InputIdx1 = Ops.size();
29348 Ops.push_back(Input1);
29349 }
29351 assert(((RootMask.size() > OpMask.size() &&
29352 RootMask.size() % OpMask.size() == 0) ||
29353 (OpMask.size() > RootMask.size() &&
29354 OpMask.size() % RootMask.size() == 0) ||
29355 OpMask.size() == RootMask.size()) &&
29356 "The smaller number of elements must divide the larger.");
29358 // This function can be performance-critical, so we rely on the power-of-2
29359 // knowledge that we have about the mask sizes to replace div/rem ops with
29360 // bit-masks and shifts.
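// E.g. with OpMask.size() == 8 the log2 value is 3, so an index i / 8 becomes
// i >> 3 and i % 8 becomes i & 7.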
29361 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
29362 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
29363 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
29364 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
29366 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
29367 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
29368 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
29369 assert((RootRatio == 1 || OpRatio == 1) &&
29370 "Must not have a ratio for both incoming and op masks!");
29372 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
29373 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
29374 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
29375 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
29376 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
29378 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
29380 // Merge this shuffle operation's mask into our accumulated mask. Note that
29381 // this shuffle's mask will be the first applied to the input, followed by the
29382 // root mask to get us all the way to the root value arrangement. The reason
29383 // for this order is that we are recursing up the operation chain.
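// For example (illustrative): with RootMask.size() == 4 and OpMask.size() == 8,
// MaskWidth == 8 and RootRatio == 2, so RootIdx == i >> 1 and each root index
// is rescaled as (RootMask[RootIdx] << 1) + (i & 1) before being mapped
// through OpMask.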
29384 for (unsigned i = 0; i < MaskWidth; ++i) {
29385 unsigned RootIdx = i >> RootRatioLog2;
29386 if (RootMask[RootIdx] < 0) {
29387 // This is a zero or undef lane, we're done.
29388 Mask[i] = RootMask[RootIdx];
29389 continue;
29390 }
29392 unsigned RootMaskedIdx =
29393 RootRatio == 1
29394 ? RootMask[RootIdx]
29395 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
29397 // Just insert the scaled root mask value if it references an input other
29398 // than the SrcOp we're currently inserting.
29399 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
29400 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
29401 Mask[i] = RootMaskedIdx;
29402 continue;
29403 }
29405 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
29406 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
29407 if (OpMask[OpIdx] < 0) {
29408 // The incoming lanes are zero or undef, it doesn't matter which ones we
29409 // are actually using.
29410 Mask[i] = OpMask[OpIdx];
29411 continue;
29412 }
29414 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
29415 unsigned OpMaskedIdx =
29416 OpRatio == 1
29417 ? OpMask[OpIdx]
29418 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
29420 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
29421 if (OpMask[OpIdx] < (int)OpMask.size()) {
29422 assert(0 <= InputIdx0 && "Unknown target shuffle input");
29423 OpMaskedIdx += InputIdx0 * MaskWidth;
29424 } else {
29425 assert(0 <= InputIdx1 && "Unknown target shuffle input");
29426 OpMaskedIdx += InputIdx1 * MaskWidth;
29427 }
29429 Mask[i] = OpMaskedIdx;
29430 }
29432 // Handle the all undef/zero cases early.
29433 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
29434 return DAG.getUNDEF(Root.getValueType());
29436 // TODO - should we handle the mixed zero/undef case as well? Just returning
29437 // a zero mask will lose information on undef elements possibly reducing
29438 // future combine possibilities.
29439 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
29440 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
29441 SDLoc(Root));
29443 // Remove unused shuffle source ops.
29444 resolveTargetShuffleInputsAndMask(Ops, Mask);
29445 assert(!Ops.empty() && "Shuffle with no inputs detected");
29447 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
29449 // Update the list of shuffle nodes that have been combined so far.
29450 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
29451 SrcNodes.end());
29452 CombinedNodes.push_back(Op.getNode());
29454 // See if we can recurse into each shuffle source op (if it's a target
29455 // shuffle). The source op should only be combined if it either has a
29456 // single use (i.e. current Op) or all its users have already been combined.
29457 for (int i = 0, e = Ops.size(); i < e; ++i)
29458 if (Ops[i].getNode()->hasOneUse() ||
29459 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
29460 if (SDValue Res = combineX86ShufflesRecursively(
29461 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
29462 DAG, DCI, Subtarget))
29463 return Res;
29465 // Attempt to constant fold all of the constant source ops.
29466 if (SDValue Cst = combineX86ShufflesConstants(
29467 Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
29468 return Cst;
29470 // We can only combine unary and binary shuffle mask cases.
29471 if (Ops.size() > 2)
29474 // Minor canonicalization of the accumulated shuffle mask to make it easier
29475 // to match below. All this does is detect masks with sequential pairs of
29476 // elements, and shrink them to the half-width mask. It does this in a loop
29477 // so it will reduce the size of the mask to the minimal width mask which
29478 // performs an equivalent shuffle.
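// For example (illustrative): a v8i16 mask <0,1,4,5,2,3,6,7> is first widened
// to the v4i32 mask <0,2,1,3>; it would only be widened again if that mask
// also paired up sequentially, which it does not.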
29479 SmallVector<int, 64> WidenedMask;
29480 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
29481 Mask = std::move(WidenedMask);
29484 // Canonicalization of binary shuffle masks to improve pattern matching by
29485 // commuting the inputs.
29486 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
29487 ShuffleVectorSDNode::commuteMask(Mask);
29488 std::swap(Ops[0], Ops[1]);
29491 // Finally, try to combine into a single shuffle instruction.
29492 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
29493 DCI, Subtarget);
29494 }
29496 /// \brief Get the PSHUF-style mask from PSHUF node.
29498 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
29499 /// PSHUF-style masks that can be reused with such instructions.
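/// For example (illustrative): for a PSHUFHW with the full v8i16 mask
/// <0,1,2,3,7,6,5,4> this returns the v4 mask <3,2,1,0>.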
29500 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
29501 MVT VT = N.getSimpleValueType();
29502 SmallVector<int, 4> Mask;
29503 SmallVector<SDValue, 2> Ops;
29504 bool IsUnary;
29505 bool HaveMask =
29506 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
29507 (void)HaveMask;
29508 assert(HaveMask);
29510 // If we have more than 128-bits, only the low 128-bits of shuffle mask
29511 // matter. Check that the upper masks are repeats and remove them.
29512 if (VT.getSizeInBits() > 128) {
29513 int LaneElts = 128 / VT.getScalarSizeInBits();
29515 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
29516 for (int j = 0; j < LaneElts; ++j)
29517 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
29518 "Mask doesn't repeat in high 128-bit lanes!");
29520 Mask.resize(LaneElts);
29523 switch (N.getOpcode()) {
29524 case X86ISD::PSHUFD:
29525 return Mask;
29526 case X86ISD::PSHUFLW:
29527 Mask.resize(4);
29528 return Mask;
29529 case X86ISD::PSHUFHW:
29530 Mask.erase(Mask.begin(), Mask.begin() + 4);
29531 for (int &M : Mask)
29532 M -= 4;
29533 return Mask;
29534 default:
29535 llvm_unreachable("No valid shuffle instruction found!");
29536 }
29537 }
29539 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
29541 /// We walk up the chain and look for a combinable shuffle, skipping over
29542 /// shuffles that we could hoist this shuffle's transformation past without
29543 /// altering anything.
29544 static SDValue
29545 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
29546 SelectionDAG &DAG) {
29547 assert(N.getOpcode() == X86ISD::PSHUFD &&
29548 "Called with something other than an x86 128-bit half shuffle!");
29549 SDLoc DL(N);
29551 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
29552 // of the shuffles in the chain so that we can form a fresh chain to replace
29554 SmallVector<SDValue, 8> Chain;
29555 SDValue V = N.getOperand(0);
29556 for (; V.hasOneUse(); V = V.getOperand(0)) {
29557 switch (V.getOpcode()) {
29558 default:
29559 return SDValue(); // Nothing combined!
29561 case ISD::BITCAST:
29562 // Skip bitcasts as we always know the type for the target specific
29563 // instructions.
29564 continue;
29566 case X86ISD::PSHUFD:
29567 // Found another dword shuffle.
29568 break;
29570 case X86ISD::PSHUFLW:
29571 // Check that the low words (being shuffled) are the identity in the
29572 // dword shuffle, and the high words are self-contained.
29573 if (Mask[0] != 0 || Mask[1] != 1 ||
29574 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
29575 return SDValue();
29577 Chain.push_back(V);
29578 continue;
29580 case X86ISD::PSHUFHW:
29581 // Check that the high words (being shuffled) are the identity in the
29582 // dword shuffle, and the low words are self-contained.
29583 if (Mask[2] != 2 || Mask[3] != 3 ||
29584 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
29585 return SDValue();
29587 Chain.push_back(V);
29588 continue;
29590 case X86ISD::UNPCKL:
29591 case X86ISD::UNPCKH:
29592 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
29593 // shuffle into a preceding word shuffle.
29594 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
29595 V.getSimpleValueType().getVectorElementType() != MVT::i16)
29596 return SDValue();
29598 // Search for a half-shuffle which we can combine with.
29599 unsigned CombineOp =
29600 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
29601 if (V.getOperand(0) != V.getOperand(1) ||
29602 !V->isOnlyUserOf(V.getOperand(0).getNode()))
29603 return SDValue();
29604 Chain.push_back(V);
29605 V = V.getOperand(0);
29606 do {
29607 switch (V.getOpcode()) {
29608 default:
29609 return SDValue(); // Nothing to combine.
29611 case X86ISD::PSHUFLW:
29612 case X86ISD::PSHUFHW:
29613 if (V.getOpcode() == CombineOp)
29614 break;
29616 Chain.push_back(V);
29618 LLVM_FALLTHROUGH;
29619 case ISD::BITCAST:
29620 V = V.getOperand(0);
29621 continue;
29622 }
29623 break;
29624 } while (V.hasOneUse());
29625 break;
29626 }
29627 // Break out of the loop if we break out of the switch.
29628 break;
29629 }
29631 if (!V.hasOneUse())
29632 // We fell out of the loop without finding a viable combining instruction.
29635 // Merge this node's mask and our incoming mask.
29636 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29637 for (int &M : Mask)
29638 M = VMask[M];
29639 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
29640 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29642 // Rebuild the chain around this new shuffle.
29643 while (!Chain.empty()) {
29644 SDValue W = Chain.pop_back_val();
29646 if (V.getValueType() != W.getOperand(0).getValueType())
29647 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
29649 switch (W.getOpcode()) {
29650 default:
29651 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
29653 case X86ISD::UNPCKL:
29654 case X86ISD::UNPCKH:
29655 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
29656 break;
29658 case X86ISD::PSHUFD:
29659 case X86ISD::PSHUFLW:
29660 case X86ISD::PSHUFHW:
29661 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
29662 break;
29663 }
29664 }
29665 if (V.getValueType() != N.getValueType())
29666 V = DAG.getBitcast(N.getValueType(), V);
29668 // Return the new chain to replace N.
29669 return V;
29670 }
29672 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
29673 /// pshufhw.
29674 ///
29675 /// We walk up the chain, skipping shuffles of the other half and looking
29676 /// through shuffles which switch halves trying to find a shuffle of the same
29677 /// pair of dwords.
29678 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
29680 TargetLowering::DAGCombinerInfo &DCI) {
29681 assert(
29682 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
29683 "Called with something other than an x86 128-bit half shuffle!");
29685 unsigned CombineOpcode = N.getOpcode();
29687 // Walk up a single-use chain looking for a combinable shuffle.
29688 SDValue V = N.getOperand(0);
29689 for (; V.hasOneUse(); V = V.getOperand(0)) {
29690 switch (V.getOpcode()) {
29691 default:
29692 return false; // Nothing combined!
29694 case ISD::BITCAST:
29695 // Skip bitcasts as we always know the type for the target specific
29696 // instructions.
29697 continue;
29699 case X86ISD::PSHUFLW:
29700 case X86ISD::PSHUFHW:
29701 if (V.getOpcode() == CombineOpcode)
29702 break;
29704 // Other-half shuffles are no-ops.
29705 continue;
29706 }
29707 // Break out of the loop if we break out of the switch.
29708 break;
29709 }
29711 if (!V.hasOneUse())
29712 // We fell out of the loop without finding a viable combining instruction.
29715 // Combine away the bottom node as its shuffle will be accumulated into
29716 // a preceding shuffle.
29717 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29719 // Record the old value.
29720 SDValue Old = V;
29722 // Merge this node's mask and our incoming mask (adjusted to account for all
29723 // the pshufd instructions encountered).
29724 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29725 for (int &M : Mask)
29726 M = VMask[M];
29727 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
29728 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29730 // Check that the shuffles didn't cancel each other out. If not, we need to
29731 // combine to the new one.
29732 if (Old != V)
29733 // Replace the combinable shuffle with the combined one, updating all users
29734 // so that we re-evaluate the chain here.
29735 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
29737 return true;
29738 }
29740 /// \brief Try to combine x86 target specific shuffles.
29741 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
29742 TargetLowering::DAGCombinerInfo &DCI,
29743 const X86Subtarget &Subtarget) {
29744 SDLoc DL(N);
29745 MVT VT = N.getSimpleValueType();
29746 SmallVector<int, 4> Mask;
29747 unsigned Opcode = N.getOpcode();
29749 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
29750 // single instruction.
29751 if (VT.getScalarSizeInBits() == 64 &&
29752 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
29753 Opcode == X86ISD::UNPCKL)) {
29754 auto BC0 = peekThroughBitcasts(N.getOperand(0));
29755 auto BC1 = peekThroughBitcasts(N.getOperand(1));
29756 EVT VT0 = BC0.getValueType();
29757 EVT VT1 = BC1.getValueType();
29758 unsigned Opcode0 = BC0.getOpcode();
29759 unsigned Opcode1 = BC1.getOpcode();
29760 if (Opcode0 == Opcode1 && VT0 == VT1 &&
29761 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
29762 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
29763 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
29764 SDValue Lo, Hi;
29765 if (Opcode == X86ISD::MOVSD) {
29766 Lo = BC1.getOperand(0);
29767 Hi = BC0.getOperand(1);
29768 } else {
29769 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29770 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29771 }
29772 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
29773 DCI.AddToWorklist(Horiz.getNode());
29774 return DAG.getBitcast(VT, Horiz);
29775 }
29776 }
29778 switch (Opcode) {
29779 case X86ISD::PSHUFD:
29780 case X86ISD::PSHUFLW:
29781 case X86ISD::PSHUFHW:
29782 Mask = getPSHUFShuffleMask(N);
29783 assert(Mask.size() == 4);
29784 break;
29785 case X86ISD::UNPCKL: {
29786 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
29787 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
29788 // moves upper half elements into the lower half part. For example:
29790 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
29792 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
29794 // will be combined to:
29796 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
29798 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
29799 // happen due to advanced instructions.
29800 if (!VT.is128BitVector())
29803 auto Op0 = N.getOperand(0);
29804 auto Op1 = N.getOperand(1);
29805 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
29806 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
29808 unsigned NumElts = VT.getVectorNumElements();
29809 SmallVector<int, 8> ExpectedMask(NumElts, -1);
29810 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
29811 NumElts / 2);
29813 auto ShufOp = Op1.getOperand(0);
29814 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
29815 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
29816 }
29817 return SDValue();
29818 }
29819 case X86ISD::BLENDI: {
29820 SDValue V0 = N->getOperand(0);
29821 SDValue V1 = N->getOperand(1);
29822 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
29823 "Unexpected input vector types");
29825 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
29826 // operands and changing the mask to 1. This saves us a bunch of
29827 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
29828 // x86InstrInfo knows how to commute this back after instruction selection
29829 // if it would help register allocation.
29831 // TODO: If optimizing for size or a processor that doesn't suffer from
29832 // partial register update stalls, this should be transformed into a MOVSD
29833 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
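// For example (illustrative): blendpd V0, V1, 2 selects <V0[0], V1[1]>;
// swapping the operands and using mask 1 (blendpd V1, V0, 1) selects the same
// elements.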
29835 if (VT == MVT::v2f64)
29836 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
29837 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
29838 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
29839 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
29844 case X86ISD::MOVSD:
29845 case X86ISD::MOVSS: {
29846 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
29847 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
29848 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
29849 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
29850 if (isZero0 && isZero1)
29853 // We often lower to MOVSD/MOVSS from integer as well as native float
29854 // types; remove unnecessary domain-crossing bitcasts if we can to make it
29855 // easier to combine shuffles later on. We've already accounted for the
29856 // domain switching cost when we decided to lower with it.
29857 bool isFloat = VT.isFloatingPoint();
29858 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
29859 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
29860 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
29861 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
29862 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
29863 V0 = DAG.getBitcast(NewVT, V0);
29864 V1 = DAG.getBitcast(NewVT, V1);
29865 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
29870 case X86ISD::INSERTPS: {
29871 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
29872 SDValue Op0 = N.getOperand(0);
29873 SDValue Op1 = N.getOperand(1);
29874 SDValue Op2 = N.getOperand(2);
29875 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
29876 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
29877 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
29878 unsigned ZeroMask = InsertPSMask & 0xF;
29880 // If we zero out all elements from Op0 then we don't need to reference it.
29881 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
29882 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
29883 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29885 // If we zero out the element from Op1 then we don't need to reference it.
29886 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
29887 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29888 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29890 // Attempt to merge insertps Op1 with an inner target shuffle node.
29891 SmallVector<int, 8> TargetMask1;
29892 SmallVector<SDValue, 2> Ops1;
29893 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
29894 int M = TargetMask1[SrcIdx];
29895 if (isUndefOrZero(M)) {
29896 // Zero/UNDEF insertion - zero out element and remove dependency.
29897 InsertPSMask |= (1u << DstIdx);
29898 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29899 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29901 // Update insertps mask srcidx and reference the source input directly.
29902 assert(0 <= M && M < 8 && "Shuffle index out of range");
29903 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
29904 Op1 = Ops1[M < 4 ? 0 : 1];
29905 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29906 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29909 // Attempt to merge insertps Op0 with an inner target shuffle node.
29910 SmallVector<int, 8> TargetMask0;
29911 SmallVector<SDValue, 2> Ops0;
29912 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
29915 bool Updated = false;
29916 bool UseInput00 = false;
29917 bool UseInput01 = false;
29918 for (int i = 0; i != 4; ++i) {
29919 int M = TargetMask0[i];
29920 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
29921 // No change if element is already zero or the inserted element.
29922 continue;
29923 } else if (isUndefOrZero(M)) {
29924 // If the target mask is undef/zero then we must zero the element.
29925 InsertPSMask |= (1u << i);
29926 Updated = true;
29927 continue;
29928 }
29930 // The input vector element must be inline.
29931 if (M != i && M != (i + 4))
29932 return SDValue();
29934 // Determine which inputs of the target shuffle we're using.
29935 UseInput00 |= (0 <= M && M < 4);
29936 UseInput01 |= (4 <= M);
29937 }
29939 // If we're not using both inputs of the target shuffle then use the
29940 // referenced input directly.
29941 if (UseInput00 && !UseInput01) {
29942 Updated = true;
29943 Op0 = Ops0[0];
29944 } else if (!UseInput00 && UseInput01) {
29945 Updated = true;
29946 Op0 = Ops0[1];
29947 }
29949 if (Updated)
29950 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29951 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29953 return SDValue();
29954 }
29955 default:
29956 return SDValue();
29957 }
29959 // Nuke no-op shuffles that show up after combining.
29960 if (isNoopShuffleMask(Mask))
29961 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29963 // Look for simplifications involving one or two shuffle instructions.
29964 SDValue V = N.getOperand(0);
29965 switch (N.getOpcode()) {
29968 case X86ISD::PSHUFLW:
29969 case X86ISD::PSHUFHW:
29970 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
29972 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
29973 return SDValue(); // We combined away this shuffle, so we're done.
29975 // See if this reduces to a PSHUFD which is no more expensive and can
29976 // combine with more operations. Note that it has to at least flip the
29977 // dwords as otherwise it would have been removed as a no-op.
29978 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
29979 int DMask[] = {0, 1, 2, 3};
29980 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
29981 DMask[DOffset + 0] = DOffset + 1;
29982 DMask[DOffset + 1] = DOffset + 0;
29983 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
29984 V = DAG.getBitcast(DVT, V);
29985 DCI.AddToWorklist(V.getNode());
29986 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
29987 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
29988 DCI.AddToWorklist(V.getNode());
29989 return DAG.getBitcast(VT, V);
29992 // Look for shuffle patterns which can be implemented as a single unpack.
29993 // FIXME: This doesn't handle the location of the PSHUFD generically, and
29994 // only works when we have a PSHUFD followed by two half-shuffles.
29995 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
29996 (V.getOpcode() == X86ISD::PSHUFLW ||
29997 V.getOpcode() == X86ISD::PSHUFHW) &&
29998 V.getOpcode() != N.getOpcode() &&
29999 V.hasOneUse()) {
30000 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
30001 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
30002 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30003 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
30004 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30005 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30006 int WordMask[8];
30007 for (int i = 0; i < 4; ++i) {
30008 WordMask[i + NOffset] = Mask[i] + NOffset;
30009 WordMask[i + VOffset] = VMask[i] + VOffset;
30010 }
30011 // Map the word mask through the DWord mask.
30012 int MappedMask[8];
30013 for (int i = 0; i < 8; ++i)
30014 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
30015 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
30016 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
30017 // We can replace all three shuffles with an unpack.
30018 V = DAG.getBitcast(VT, D.getOperand(0));
30019 DCI.AddToWorklist(V.getNode());
30020 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
30021 : X86ISD::UNPCKH,
30022 DL, VT, V, V);
30023 }
30024 }
30025 }
30027 break;
30029 case X86ISD::PSHUFD:
30030 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
30031 return NewN;
30033 break;
30034 }
30036 return SDValue();
30037 }
30039 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB (or
30040 /// SUBADD) operation. If true is returned then the operands of the ADDSUB (or
30041 /// SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
30043 /// We combine shuffles to ADDSUB (or SUBADD) directly on the abstract vector
30044 /// shuffle nodes so they are easier to match generically. We also insert dummy
30045 /// vector shuffle nodes for the operands which explicitly discard the lanes
30046 /// which are unused by this operation, to try to flow the fact that they are
30047 /// unused through the rest of the combiner.
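///
/// A minimal illustration (assumed, not from the original docs): for v4f32, a
/// shuffle <0,5,2,7> of (fsub A, B) and (fadd A, B) selects the FSUB result in
/// lanes 0/2 and the FADD result in lanes 1/3, which is exactly the ADDSUB
/// pattern matched below.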
30048 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
30049 SDValue &Opnd0, SDValue &Opnd1,
30050 bool matchSubAdd = false) {
30052 EVT VT = N->getValueType(0);
30053 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
30054 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
30055 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
30058 // We only handle target-independent shuffles.
30059 // FIXME: It would be easy and harmless to use the target shuffle mask
30060 // extraction tool to support more.
30061 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
30064 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
30065 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
30067 SDValue V1 = N->getOperand(0);
30068 SDValue V2 = N->getOperand(1);
30070 unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB;
30071 unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD;
30073 // We require the first shuffle operand to be the ExpectedOpcode node,
30074 // and the second to be the NextExpectedOpcode node.
30075 if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) {
30076 ShuffleVectorSDNode::commuteMask(Mask);
30077 std::swap(V1, V2);
30078 } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode)
30079 return false;
30081 // If there are other uses of these operations we can't fold them.
30082 if (!V1->hasOneUse() || !V2->hasOneUse())
30083 return false;
30085 // Ensure that both operations have the same operands. Note that we can
30086 // commute the FADD operands.
30087 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
30088 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
30089 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
30092 // We're looking for blends between FADD and FSUB nodes. We insist on these
30093 // nodes being lined up in a specific expected pattern.
30094 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
30095 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
30096 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
30097 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
30098 8, 25, 10, 27, 12, 29, 14, 31})))
30099 return false;
30101 Opnd0 = LHS;
30102 Opnd1 = RHS;
30103 return true;
30104 }
30106 /// \brief Try to combine a shuffle into a target-specific add-sub or
30107 /// mul-add-sub node.
30108 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
30109 const X86Subtarget &Subtarget,
30110 SelectionDAG &DAG) {
30111 SDValue Opnd0, Opnd1;
30112 if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1))
30113 return SDValue();
30115 EVT VT = N->getValueType(0);
30116 SDLoc DL(N);
30118 // Try to generate X86ISD::FMADDSUB node here.
30119 SDValue Opnd2;
30120 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
30121 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
30123 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
30124 // the ADDSUB idiom has been successfully recognized. There are no known
30125 // X86 targets with 512-bit ADDSUB instructions!
30126 if (VT.is512BitVector())
30129 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
30132 /// \brief Try to combine a shuffle into a target-specific
30133 /// mul-sub-add node.
30134 static SDValue combineShuffleToFMSubAdd(SDNode *N,
30135 const X86Subtarget &Subtarget,
30136 SelectionDAG &DAG) {
30137 SDValue Opnd0, Opnd1;
30138 if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true))
30139 return SDValue();
30141 EVT VT = N->getValueType(0);
30142 SDLoc DL(N);
30144 // Try to generate X86ISD::FMSUBADD node here.
30145 SDValue Opnd2;
30146 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
30147 return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2);
30149 return SDValue();
30150 }
30152 // We are looking for a shuffle where both sources are concatenated with undef
30153 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
30154 // if we can express this as a single-source shuffle, that's preferable.
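// For example (illustrative): with v2i32 inputs t1 and t2,
//   shuffle <0,1,4,5> (concat t1, undef), (concat t2, undef)
// becomes
//   shuffle <0,1,2,3> (concat t1, t2), undef
// since indices from the second source drop the NumElts/2 undef gap.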
30155 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
30156 const X86Subtarget &Subtarget) {
30157 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
30160 EVT VT = N->getValueType(0);
30162 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
30163 if (!VT.is128BitVector() && !VT.is256BitVector())
30166 if (VT.getVectorElementType() != MVT::i32 &&
30167 VT.getVectorElementType() != MVT::i64 &&
30168 VT.getVectorElementType() != MVT::f32 &&
30169 VT.getVectorElementType() != MVT::f64)
30172 SDValue N0 = N->getOperand(0);
30173 SDValue N1 = N->getOperand(1);
30175 // Check that both sources are concats with undef.
30176 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
30177 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
30178 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
30179 !N1.getOperand(1).isUndef())
30182 // Construct the new shuffle mask. Elements from the first source retain their
30183 // index, but elements from the second source no longer need to skip an undef.
30184 SmallVector<int, 8> Mask;
30185 int NumElts = VT.getVectorNumElements();
30187 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
30188 for (int Elt : SVOp->getMask())
30189 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
30191 SDLoc DL(N);
30192 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
30193 N1.getOperand(0));
30194 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
30195 }
30197 /// Eliminate a redundant shuffle of a horizontal math op.
30198 static SDValue foldShuffleOfHorizOp(SDNode *N) {
30199 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
30202 SDValue HOp = N->getOperand(0);
30203 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
30204 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
30207 // 128-bit horizontal math instructions are defined to operate on adjacent
30208 // lanes of each operand as:
30209 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
30210 // ...similarly for v2f64 and v8i16.
30211 // TODO: 256-bit is not the same because...x86.
30212 if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
30215 // When the operands of a horizontal math op are identical, the low half of
30216 // the result is the same as the high half. If the shuffle is also replicating
30217 // low and high halves, we don't need the shuffle.
30218 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
30219 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
30220 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
30221 // but this should be tied to whatever horizontal op matching and shuffle
30222 // canonicalization are producing.
30223 if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
30224 isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
30225 isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
30231 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
30232 TargetLowering::DAGCombinerInfo &DCI,
30233 const X86Subtarget &Subtarget) {
30234 SDLoc dl(N);
30235 EVT VT = N->getValueType(0);
30236 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30237 // If we have legalized the vector types, look for blends of FADD and FSUB
30238 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
30239 if (TLI.isTypeLegal(VT)) {
30240 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
30241 return AddSub;
30243 if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
30244 return FMSubAdd;
30246 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
30247 return HAddSub;
30248 }
30250 // During Type Legalization, when promoting illegal vector types,
30251 // the backend might introduce new shuffle dag nodes and bitcasts.
30253 // This code performs the following transformation:
30254 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
30255 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
30257 // We do this only if both the bitcast and the BINOP dag nodes have
30258 // one use. Also, perform this transformation only if the new binary
30259 // operation is legal. This is to avoid introducing dag nodes that
30260 // potentially need to be further expanded (or custom lowered) into a
30261 // less optimal sequence of dag nodes.
30262 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
30263 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
30264 N->getOperand(0).getOpcode() == ISD::BITCAST &&
30265 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
30266 SDValue N0 = N->getOperand(0);
30267 SDValue N1 = N->getOperand(1);
30269 SDValue BC0 = N0.getOperand(0);
30270 EVT SVT = BC0.getValueType();
30271 unsigned Opcode = BC0.getOpcode();
30272 unsigned NumElts = VT.getVectorNumElements();
30274 if (BC0.hasOneUse() && SVT.isVector() &&
30275 SVT.getVectorNumElements() * 2 == NumElts &&
30276 TLI.isOperationLegal(Opcode, VT)) {
30277 bool CanFold = false;
30278 switch (Opcode) {
30279 default : break;
30280 case ISD::ADD:
30281 case ISD::SUB:
30282 case ISD::MUL:
30283 // isOperationLegal lies for integer ops on floating point types.
30284 CanFold = VT.isInteger();
30285 break;
30286 case ISD::FADD:
30287 case ISD::FSUB:
30288 case ISD::FMUL:
30289 // isOperationLegal lies for floating point ops on integer types.
30290 CanFold = VT.isFloatingPoint();
30291 break;
30292 }
30294 unsigned SVTNumElts = SVT.getVectorNumElements();
30295 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
30296 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
30297 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
30298 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
30299 CanFold = SVOp->getMaskElt(i) < 0;
30302 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
30303 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
30304 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
30305 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
30310 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
30311 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
30312 // consecutive, non-overlapping, and in the right order.
30313 SmallVector<SDValue, 16> Elts;
30314 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
30315 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
30316 Elts.push_back(Elt);
30317 continue;
30318 }
30319 Elts.clear();
30320 break;
30321 }
30323 if (Elts.size() == VT.getVectorNumElements())
30324 if (SDValue LD =
30325 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
30326 return LD;
30328 // For AVX2, we sometimes want to combine
30329 // (vector_shuffle <mask> (concat_vectors t1, undef)
30330 // (concat_vectors t2, undef))
30332 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
30333 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
30334 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
30335 return ShufConcat;
30337 if (isTargetShuffle(N->getOpcode())) {
30338 SDValue Op(N, 0);
30339 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
30340 return Shuffle;
30342 // Try recursively combining arbitrary sequences of x86 shuffle
30343 // instructions into higher-order shuffles. We do this after combining
30344 // specific PSHUF instruction sequences into their minimal form so that we
30345 // can evaluate how many specialized shuffle instructions are involved in
30346 // a particular chain.
30347 if (SDValue Res = combineX86ShufflesRecursively(
30348 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
30349 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
30350 DCI.CombineTo(N, Res);
30351 return SDValue();
30352 }
30353 }
30355 return SDValue();
30356 }
30358 /// Check if a vector extract from a target-specific shuffle of a load can be
30359 /// folded into a single element load.
30360 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
30361 /// shuffles have been custom lowered so we need to handle those here.
30362 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
30363 TargetLowering::DAGCombinerInfo &DCI) {
30364 if (DCI.isBeforeLegalizeOps())
30367 SDValue InVec = N->getOperand(0);
30368 SDValue EltNo = N->getOperand(1);
30369 EVT EltVT = N->getValueType(0);
30371 if (!isa<ConstantSDNode>(EltNo))
30374 EVT OriginalVT = InVec.getValueType();
30376 // Peek through bitcasts, don't duplicate a load with other uses.
30377 InVec = peekThroughOneUseBitcasts(InVec);
30379 EVT CurrentVT = InVec.getValueType();
30380 if (!CurrentVT.isVector() ||
30381 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
30384 if (!isTargetShuffle(InVec.getOpcode()))
30387 // Don't duplicate a load with other uses.
30388 if (!InVec.hasOneUse())
30391 SmallVector<int, 16> ShuffleMask;
30392 SmallVector<SDValue, 2> ShuffleOps;
30393 bool UnaryShuffle;
30394 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
30395 ShuffleOps, ShuffleMask, UnaryShuffle))
30396 return SDValue();
30398 // Select the input vector, guarding against out of range extract vector.
30399 unsigned NumElems = CurrentVT.getVectorNumElements();
30400 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
30401 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
30403 if (Idx == SM_SentinelZero)
30404 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
30405 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
30406 if (Idx == SM_SentinelUndef)
30407 return DAG.getUNDEF(EltVT);
30409 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
30410 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
30411 : ShuffleOps[1];
30413 // If inputs to shuffle are the same for both ops, then allow 2 uses
30414 unsigned AllowedUses =
30415 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
30417 if (LdNode.getOpcode() == ISD::BITCAST) {
30418 // Don't duplicate a load with other uses.
30419 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
30422 AllowedUses = 1; // only allow 1 load use if we have a bitcast
30423 LdNode = LdNode.getOperand(0);
30426 if (!ISD::isNormalLoad(LdNode.getNode()))
30429 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
30431 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
30432 return SDValue();
30434 // If there's a bitcast before the shuffle, check if the load type and
30435 // alignment is valid.
30436 unsigned Align = LN0->getAlignment();
30437 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30438 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
30439 EltVT.getTypeForEVT(*DAG.getContext()));
30441 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
30442 return SDValue();
30444 // All checks match, so transform back to vector_shuffle so that the DAG
30445 // combiner can finish the job.
30446 SDLoc dl(N);
30448 // Create the shuffle node, taking into account the case that it's a unary shuffle.
30449 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
30450 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
30451 ShuffleMask);
30452 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
30453 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
30454 EltNo);
30455 }
30457 // Try to match patterns such as
30458 // (i16 bitcast (v16i1 x))
30459 // ->
30460 // (i16 movmsk (v16i8 sext (v16i1 x)))
30461 // before the illegal vector is scalarized on subtargets that don't have legal
30462 // vxi1 types.
30463 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
30464 const X86Subtarget &Subtarget) {
30465 EVT VT = BitCast.getValueType();
30466 SDValue N0 = BitCast.getOperand(0);
30467 EVT VecVT = N0->getValueType(0);
30469 if (!VT.isScalarInteger() || !VecVT.isSimple())
30472 // With AVX512 vxi1 types are legal and we prefer using k-regs.
30473 // MOVMSK is supported in SSE2 or later.
30474 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
30477 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
30478 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
30479 // v8i16 and v16i16.
30480 // For these two cases, we can shuffle the upper element bytes to a
30481 // consecutive sequence at the start of the vector and treat the results as
30482 // v16i8 or v32i8, and for the v8i16 case this is the preferable solution.
30483 // However, for v16i16 this is not the case, because the shuffle is expensive,
30484 // so we avoid sign-extending to this type entirely.
30485 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
30486 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
30487 MVT SExtVT;
30488 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
30489 switch (VecVT.getSimpleVT().SimpleTy) {
30490 default:
30491 return SDValue();
30492 case MVT::v2i1:
30493 SExtVT = MVT::v2i64;
30494 FPCastVT = MVT::v2f64;
30495 break;
30496 case MVT::v4i1:
30497 SExtVT = MVT::v4i32;
30498 FPCastVT = MVT::v4f32;
30499 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
30500 // sign-extend to a 256-bit operation to avoid truncation.
30501 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30502 N0->getOperand(0).getValueType().is256BitVector()) {
30503 SExtVT = MVT::v4i64;
30504 FPCastVT = MVT::v4f64;
30508 SExtVT = MVT::v8i16;
30509 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
30510 // sign-extend to a 256-bit operation to match the compare.
30511 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
30512 // 256-bit because the shuffle is cheaper than sign extending the result of
30514 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30515 (N0->getOperand(0).getValueType().is256BitVector() ||
30516 N0->getOperand(0).getValueType().is512BitVector())) {
30517 SExtVT = MVT::v8i32;
30518 FPCastVT = MVT::v8f32;
30522 SExtVT = MVT::v16i8;
30523 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
30524 // it is not profitable to sign-extend to 256-bit because this will
30525 // require an extra cross-lane shuffle which is more expensive than
30526 // truncating the result of the compare to 128-bits.
30529 SExtVT = MVT::v32i8;
30534 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
30536 if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30537 // Handle pre-AVX2 cases by splitting to two v16i1's.
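// Illustrative sketch of the split: for a v32i1 source the final mask is
// assembled roughly as
//   Mask = PMOVMSKB(bytes 0..15) | (PMOVMSKB(bytes 16..31) << 16)
// so the resulting i32 carries one bit per original v32i1 element.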
30538 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30539 MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
30540 SDValue Lo = extract128BitVector(V, 0, DAG, DL);
30541 SDValue Hi = extract128BitVector(V, 16, DAG, DL);
30542 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30543 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30544 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30545 DAG.getConstant(16, DL, ShiftTy));
30546 V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30547 return DAG.getZExtOrTrunc(V, DL, VT);
30550 if (SExtVT == MVT::v8i16) {
30551 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
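// Illustrative note: each v8i16 lane of V is all-ones or all-zeros, so PACKSS
// saturates it to a 0xFF or 0x00 byte. The eight mask bytes land in the low
// half of the v16i8; MOVMSK then produces the v8i1 mask in bits 0-7, and the
// upper bits (from the undef half) are discarded by the final zext/trunc.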
30552 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
30553 DAG.getUNDEF(MVT::v8i16));
30555 assert(SExtVT.getScalarType() != MVT::i16 &&
30556 "Vectors of i16 must be packed");
30557 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
30558 V = DAG.getBitcast(FPCastVT, V);
30559 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30560 return DAG.getZExtOrTrunc(V, DL, VT);
30563 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
30564 TargetLowering::DAGCombinerInfo &DCI,
30565 const X86Subtarget &Subtarget) {
30566 SDValue N0 = N->getOperand(0);
30567 EVT VT = N->getValueType(0);
30568 EVT SrcVT = N0.getValueType();
30570 // Try to match patterns such as
30571 // (i16 bitcast (v16i1 x))
30572 // ->
30573 // (i16 movmsk (v16i8 sext (v16i1 x)))
30574 // before the setcc result is scalarized on subtargets that don't have legal
30575 // vxi1 types.
30576 if (DCI.isBeforeLegalize()) {
30577 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
30580 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
30581 // type, widen both sides to avoid a trip through memory.
30582 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
30583 Subtarget.hasVLX()) {
30585 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
30586 N0 = DAG.getBitcast(MVT::v8i1, N0);
30587 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
30588 DAG.getIntPtrConstant(0, dl));
30591 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
30592 // type, widen both sides to avoid a trip through memory.
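// For example (illustrative): for (i2 bitcast (v2i1 K)) the code below
// roughly concatenates K with undef v2i1 values to form a v8i1, bitcasts that
// to i8, and truncates the i8 to i2, so the value stays in a mask register
// instead of bouncing through memory.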
30593 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
30594 Subtarget.hasVLX()) {
30596 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
30597 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
30599 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
30600 N0 = DAG.getBitcast(MVT::i8, N0);
30601 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
30605 // Since MMX types are special and don't usually play with other vector types,
30606 // it's better to handle them early to be sure we emit efficient code by
30607 // avoiding store-load conversions.
30609 // Detect bitcasts from i32 to the x86mmx low word.
30610 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
30611 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
30612 SDValue N00 = N0->getOperand(0);
30613 if (N00.getValueType() == MVT::i32)
30614 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
30617 // Detect bitcasts from an element or subvector extraction to x86mmx.
30618 if (VT == MVT::x86mmx &&
30619 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
30620 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
30621 isNullConstant(N0.getOperand(1))) {
30622 SDValue N00 = N0->getOperand(0);
30623 if (N00.getValueType().is128BitVector())
30624 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
30625 DAG.getBitcast(MVT::v2i64, N00));
30628 // Detect bitcasts from FP_TO_SINT to x86mmx.
30629 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
30630 N0.getOpcode() == ISD::FP_TO_SINT) {
30632 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
30633 DAG.getUNDEF(MVT::v2i32));
30634 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
30635 DAG.getBitcast(MVT::v2i64, Res));
30638 // Convert a bitcasted integer logic operation that has one bitcasted
30639 // floating-point operand into a floating-point logic operation. This may
30640 // create a load of a constant, but that is cheaper than materializing the
30641 // constant in an integer register and transferring it to an SSE register or
30642 // transferring the SSE operand to integer register and back.
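// For example (illustrative): (f32 bitcast (and (i32 bitcast X), C)) becomes
// (FAND X, (f32 bitcast C)); with C = 0x7fffffff this is fabs(X) and the
// value never has to leave the SSE register file.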
30644 switch (N0.getOpcode()) {
30645 case ISD::AND: FPOpcode = X86ISD::FAND; break;
30646 case ISD::OR: FPOpcode = X86ISD::FOR; break;
30647 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
30648 default: return SDValue();
30651 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
30652 (Subtarget.hasSSE2() && VT == MVT::f64)))
30655 SDValue LogicOp0 = N0.getOperand(0);
30656 SDValue LogicOp1 = N0.getOperand(1);
30659 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
30660 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
30661 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
30662 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
30663 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
30664 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
30666 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
30667 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
30668 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
30669 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
30670 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
30671 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
30677 // Match a binop + shuffle pyramid that represents a horizontal reduction over
30678 // the elements of a vector.
30679 // Returns the vector that is being reduced on, or SDValue() if a reduction
30680 // was not matched.
30681 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
30682 ArrayRef<ISD::NodeType> CandidateBinOps) {
30683 // The pattern must end in an extract from index 0.
30684 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
30685 !isNullConstant(Extract->getOperand(1)))
30688 SDValue Op = Extract->getOperand(0);
30689 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
30691 // Match against one of the candidate binary ops.
30692 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
30693 return Op.getOpcode() == unsigned(BinOp);
30697 // At each stage, we're looking for something that looks like:
30698 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
30699 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
30700 // i32 undef, i32 undef, i32 undef, i32 undef>
30701 // %a = binop <8 x i32> %op, %s
30702 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
30703 // we expect something like:
30704 // <4,5,6,7,u,u,u,u>
30705 // <2,3,u,u,u,u,u,u>
30706 // <1,u,u,u,u,u,u,u>
30707 unsigned CandidateBinOp = Op.getOpcode();
30708 for (unsigned i = 0; i < Stages; ++i) {
30709 if (Op.getOpcode() != CandidateBinOp)
30712 ShuffleVectorSDNode *Shuffle =
30713 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
30715 Op = Op.getOperand(1);
30717 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
30718 Op = Op.getOperand(0);
30721 // The first operand of the shuffle should be the same as the other operand
30722 // of the binop.
30723 if (!Shuffle || Shuffle->getOperand(0) != Op)
30726 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
30727 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
30728 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
30732 BinOp = CandidateBinOp;
30736 // Given a select, detect the following pattern:
30737 // 1: %2 = zext <N x i8> %0 to <N x i32>
30738 // 2: %3 = zext <N x i8> %1 to <N x i32>
30739 // 3: %4 = sub nsw <N x i32> %2, %3
30740 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30741 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
30742 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30743 // This is useful as it is the input into a SAD pattern.
30744 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
30746 // Check that the condition of the select instruction is greater-than or less-than.
30747 SDValue SetCC = Select->getOperand(0);
30748 if (SetCC.getOpcode() != ISD::SETCC)
30750 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30751 if (CC != ISD::SETGT && CC != ISD::SETLT)
30754 SDValue SelectOp1 = Select->getOperand(1);
30755 SDValue SelectOp2 = Select->getOperand(2);
30757 // The following instructions assume SelectOp1 is the subtraction operand
30758 // and SelectOp2 is the negation operand.
30759 // In the case of SETLT this is the other way around.
30760 if (CC == ISD::SETLT)
30761 std::swap(SelectOp1, SelectOp2);
30763 // The second operand of the select should be the negation of the first
30764 // operand, which is implemented as 0 - SelectOp1.
30765 if (!(SelectOp2.getOpcode() == ISD::SUB &&
30766 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
30767 SelectOp2.getOperand(1) == SelectOp1))
30770 // The first operand of SetCC is the first operand of the select, which is the
30771 // difference between the two input vectors.
30772 if (SetCC.getOperand(0) != SelectOp1)
30775 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
30777 if ((CC == ISD::SETLT) &&
30778 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
30779 SplatVal.isOneValue()) ||
30780 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
30783 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
30784 if ((CC == ISD::SETGT) &&
30785 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30786 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30789 // The first operand of the select is the difference between the two input
30790 // vectors.
30791 if (SelectOp1.getOpcode() != ISD::SUB)
30794 Op0 = SelectOp1.getOperand(0);
30795 Op1 = SelectOp1.getOperand(1);
30797 // Check if the operands of the sub are zero-extended from vectors of i8.
30798 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30799 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30800 Op1.getOpcode() != ISD::ZERO_EXTEND ||
30801 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30807 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
30809 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
30810 const SDValue &Zext1, const SDLoc &DL) {
30812 // Find the appropriate width for the PSADBW.
30813 EVT InVT = Zext0.getOperand(0).getValueType();
30814 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
30816 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
30817 // fill in the missing vector elements with 0.
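// E.g. (illustrative) a v4i8 input is placed in lanes 0-3 of a v16i8 with
// zeros in lanes 4-15; since both PSADBW operands are zero in those lanes,
// they contribute nothing to the absolute-difference sums.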
30818 unsigned NumConcat = RegSize / InVT.getSizeInBits();
30819 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30820 Ops[0] = Zext0.getOperand(0);
30821 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30822 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30823 Ops[0] = Zext1.getOperand(0);
30824 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30826 // Actually build the SAD
30827 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
30828 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
30831 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
30832 // PHMINPOSUW.
30833 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
30834 const X86Subtarget &Subtarget) {
30835 // Bail without SSE41.
30836 if (!Subtarget.hasSSE41())
30839 EVT ExtractVT = Extract->getValueType(0);
30840 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
30843 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
30845 SDValue Src = matchBinOpReduction(
30846 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
30850 EVT SrcVT = Src.getValueType();
30851 EVT SrcSVT = SrcVT.getScalarType();
30852 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
30856 SDValue MinPos = Src;
30858 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
30859 while (SrcVT.getSizeInBits() > 128) {
30860 unsigned NumElts = SrcVT.getVectorNumElements();
30861 unsigned NumSubElts = NumElts / 2;
30862 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
30863 unsigned SubSizeInBits = SrcVT.getSizeInBits();
30864 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
30865 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
30866 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
30868 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
30869 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
30870 "Unexpected value type");
30872 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
30873 // to flip the value accordingly.
30875 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
30876 if (BinOp == ISD::SMAX)
30877 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
30878 else if (BinOp == ISD::SMIN)
30879 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
30880 else if (BinOp == ISD::UMAX)
30881 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
30884 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
30886 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
30887 // shuffling each upper element down and inserting zeros. This means that the
30888 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
30889 // ready for the PHMINPOS.
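// Rough sketch: bytes <x0..x15> are UMIN'd against <x1,0,x3,0,...,x15,0>,
// giving <min(x0,x1), 0, min(x2,x3), 0, ...>. Read as v8i16, every lane is a
// zero-extended byte, so the u16 minimum found by PHMINPOSUW equals the
// original byte minimum.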
30890 if (ExtractVT == MVT::i8) {
30891 SDValue Upper = DAG.getVectorShuffle(
30892 SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
30893 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
30894 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
30897 // Perform the PHMINPOS on a v8i16 vector.
30898 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
30899 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
30900 MinPos = DAG.getBitcast(SrcVT, MinPos);
30903 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
30905 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
30906 DAG.getIntPtrConstant(0, DL));
30909 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
30910 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
30912 const X86Subtarget &Subtarget) {
30913 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
30914 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
30917 EVT ExtractVT = Extract->getValueType(0);
30918 unsigned BitWidth = ExtractVT.getSizeInBits();
30919 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
30920 ExtractVT != MVT::i8)
30923 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
30924 unsigned BinOp = 0;
30925 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
30929 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
30930 // which we can't support here for now.
30931 if (Match.getScalarValueSizeInBits() != BitWidth)
30934 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
30935 unsigned MatchSizeInBits = Match.getValueSizeInBits();
30936 if (!(MatchSizeInBits == 128 ||
30937 (MatchSizeInBits == 256 &&
30938 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
30941 // Don't bother performing this for 2-element vectors.
30942 if (Match.getValueType().getVectorNumElements() <= 2)
30945 // Check that we are extracting a reduction of all sign bits.
30946 if (DAG.ComputeNumSignBits(Match) != BitWidth)
30949 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
30951 if (64 == BitWidth || 32 == BitWidth)
30952 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
30953 MatchSizeInBits / BitWidth);
30955 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
30958 ISD::CondCode CondCode;
30959 if (BinOp == ISD::OR) {
30960 // any_of -> MOVMSK != 0
30961 CompareBits = APInt::getNullValue(32);
30962 CondCode = ISD::CondCode::SETNE;
30964 // all_of -> MOVMSK == ((1 << NumElts) - 1)
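// E.g. (illustrative) an all_of over v8f32 compare results uses MOVMSKPS to
// form an 8-bit mask and then checks Mask == 0xFF, while the corresponding
// any_of simply checks Mask != 0.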
30965 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
30966 CondCode = ISD::CondCode::SETEQ;
30969 // Perform the select as i32/i64 and then truncate to avoid partial register
30970 // stalls.
30971 unsigned ResWidth = std::max(BitWidth, 32u);
30972 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
30974 SDValue Zero = DAG.getConstant(0, DL, ResVT);
30975 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
30976 SDValue Res = DAG.getBitcast(MaskVT, Match);
30977 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
30978 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
30979 Ones, Zero, CondCode);
30980 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
30983 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
30984 const X86Subtarget &Subtarget) {
30985 // PSADBW is only supported on SSE2 and up.
30986 if (!Subtarget.hasSSE2())
30990 // Verify that the type we're extracting from is an integer type wider than i16.
30990 EVT VT = Extract->getOperand(0).getValueType();
30991 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
30994 unsigned RegSize = 128;
30995 if (Subtarget.hasBWI())
30997 else if (Subtarget.hasAVX2())
31000 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
31001 // TODO: We should be able to handle larger vectors by splitting them before
31002 // feeding them into several SADs, and then reducing over those.
31003 if (RegSize / VT.getVectorNumElements() < 8)
31006 // Match shuffle + add pyramid.
31007 unsigned BinOp = 0;
31008 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
31010 // The operand is expected to be zero extended from i8
31011 // (verified in detectZextAbsDiff).
31012 // In order to convert to i64 and above, additional any/zero/sign
31013 // extend is expected.
31014 // The zero extend from 32 bit has no mathematical effect on the result.
31015 // Also, the sign extend is effectively a zero extend here, because it
31016 // extends the sign bit, which is known to be zero.
31017 // So it is correct to skip the sign/zero extend instruction.
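// E.g. (illustrative) the reduced vector may be (zext vXi32 %absdiff to
// vXi64); every abs-diff lane is in [0, 255], so its sign bit is zero and
// looking through the extend does not change the reduced value.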
31018 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
31019 Root.getOpcode() == ISD::ZERO_EXTEND ||
31020 Root.getOpcode() == ISD::ANY_EXTEND))
31021 Root = Root.getOperand(0);
31023 // If there was a match, we want Root to be a select that is the root of an
31024 // abs-diff pattern.
31025 if (!Root || (Root.getOpcode() != ISD::VSELECT))
31028 // Check whether we have an abs-diff pattern feeding into the select.
31029 SDValue Zext0, Zext1;
31030 if (!detectZextAbsDiff(Root, Zext0, Zext1))
31033 // Create the SAD instruction.
31035 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
31037 // If the original vector was wider than 8 elements, sum over the results
31038 // in the SAD vector.
31039 unsigned Stages = Log2_32(VT.getVectorNumElements());
31040 MVT SadVT = SAD.getSimpleValueType();
31042 unsigned SadElems = SadVT.getVectorNumElements();
31044 for (unsigned i = Stages - 3; i > 0; --i) {
31045 SmallVector<int, 16> Mask(SadElems, -1);
31046 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
31047 Mask[j] = MaskEnd + j;
31050 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
31051 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
31055 MVT Type = Extract->getSimpleValueType(0);
31056 unsigned TypeSizeInBits = Type.getSizeInBits();
31057 // Return the lowest TypeSizeInBits bits.
31058 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
31059 SAD = DAG.getBitcast(ResVT, SAD);
31060 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
31061 Extract->getOperand(1));
31064 // Attempt to peek through a target shuffle and extract the scalar from the
31065 // source vector.
31066 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
31067 TargetLowering::DAGCombinerInfo &DCI,
31068 const X86Subtarget &Subtarget) {
31069 if (DCI.isBeforeLegalizeOps())
31072 SDValue Src = N->getOperand(0);
31073 SDValue Idx = N->getOperand(1);
31075 EVT VT = N->getValueType(0);
31076 EVT SrcVT = Src.getValueType();
31077 EVT SrcSVT = SrcVT.getVectorElementType();
31078 unsigned NumSrcElts = SrcVT.getVectorNumElements();
31080 // Don't attempt this for boolean mask vectors or unknown extraction indices.
31081 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
31084 // Handle extract(broadcast(scalar_value)); it doesn't matter what the index is.
31085 if (X86ISD::VBROADCAST == Src.getOpcode() &&
31086 Src.getOperand(0).getValueType() == VT)
31087 return Src.getOperand(0);
31089 // Resolve the target shuffle inputs and mask.
31090 SmallVector<int, 16> Mask;
31091 SmallVector<SDValue, 2> Ops;
31092 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
31095 // Attempt to narrow/widen the shuffle mask to the correct size.
31096 if (Mask.size() != NumSrcElts) {
31097 if ((NumSrcElts % Mask.size()) == 0) {
31098 SmallVector<int, 16> ScaledMask;
31099 int Scale = NumSrcElts / Mask.size();
31100 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
31101 Mask = std::move(ScaledMask);
31102 } else if ((Mask.size() % NumSrcElts) == 0) {
31103 SmallVector<int, 16> WidenedMask;
31104 while (Mask.size() > NumSrcElts &&
31105 canWidenShuffleElements(Mask, WidenedMask))
31106 Mask = std::move(WidenedMask);
31107 // TODO - investigate support for wider shuffle masks with known upper
31108 // undef/zero elements for implicit zero-extension.
31112 // Check if narrowing/widening failed.
31113 if (Mask.size() != NumSrcElts)
31116 int SrcIdx = Mask[N->getConstantOperandVal(1)];
31119 // If the shuffle source element is undef/zero then we can just accept it.
31120 if (SrcIdx == SM_SentinelUndef)
31121 return DAG.getUNDEF(VT);
31123 if (SrcIdx == SM_SentinelZero)
31124 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
31125 : DAG.getConstant(0, dl, VT);
31127 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
31128 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
31129 SrcIdx = SrcIdx % Mask.size();
31131 // We can only extract other elements from 128-bit vectors and in certain
31132 // circumstances, depending on SSE-level.
31133 // TODO: Investigate using extract_subvector for larger vectors.
31134 // TODO: Investigate float/double extraction if it will be just stored.
31135 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
31136 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
31137 assert(SrcSVT == VT && "Unexpected extraction type");
31138 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
31139 DAG.getIntPtrConstant(SrcIdx, dl));
31142 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
31143 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
31144 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
31145 "Unexpected extraction type");
31146 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
31147 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
31148 DAG.getIntPtrConstant(SrcIdx, dl));
31149 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
31155 /// Detect vector gather/scatter index generation and convert it from being a
31156 /// bunch of shuffles and extracts into a somewhat faster sequence.
31157 /// For i686, the best sequence is apparently storing the value and loading
31158 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
31159 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
31160 TargetLowering::DAGCombinerInfo &DCI,
31161 const X86Subtarget &Subtarget) {
31162 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
31165 // TODO - Remove this once we can handle the implicit zero-extension of
31166 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
31167 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
31168 // combineBasicSADPattern.
31169 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
31172 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
31175 SDValue InputVector = N->getOperand(0);
31176 SDValue EltIdx = N->getOperand(1);
31178 EVT SrcVT = InputVector.getValueType();
31179 EVT VT = N->getValueType(0);
31180 SDLoc dl(InputVector);
31182 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
31183 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
31184 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
31185 SDValue MMXSrc = InputVector.getOperand(0);
31187 // The bitcast source is a direct mmx result.
31188 if (MMXSrc.getValueType() == MVT::x86mmx)
31189 return DAG.getBitcast(VT, InputVector);
31192 // Detect mmx to i32 conversion through a v2i32 elt extract.
31193 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
31194 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
31195 SDValue MMXSrc = InputVector.getOperand(0);
31197 // The bitcast source is a direct mmx result.
31198 if (MMXSrc.getValueType() == MVT::x86mmx)
31199 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
31202 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
31203 isa<ConstantSDNode>(EltIdx) &&
31204 isa<ConstantSDNode>(InputVector.getOperand(0))) {
31205 uint64_t ExtractedElt = N->getConstantOperandVal(1);
31206 uint64_t InputValue = InputVector.getConstantOperandVal(0);
31207 uint64_t Res = (InputValue >> ExtractedElt) & 1;
31208 return DAG.getConstant(Res, dl, MVT::i1);
31211 // Check whether this extract is the root of a sum of absolute differences
31212 // pattern. This has to be done here because we really want it to happen
31213 // pre-legalization.
31214 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
31217 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
31218 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
31221 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
31222 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
31225 // Only operate on vectors of 4 elements, where the alternative shuffling
31226 // gets to be more expensive.
31227 if (SrcVT != MVT::v4i32)
31230 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
31231 // single use which is a sign-extend or zero-extend, and all elements are
31232 // used.
31233 SmallVector<SDNode *, 4> Uses;
31234 unsigned ExtractedElements = 0;
31235 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
31236 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
31237 if (UI.getUse().getResNo() != InputVector.getResNo())
31240 SDNode *Extract = *UI;
31241 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
31244 if (Extract->getValueType(0) != MVT::i32)
31246 if (!Extract->hasOneUse())
31248 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
31249 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
31251 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
31254 // Record which element was extracted.
31255 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
31256 Uses.push_back(Extract);
31259 // If not all the elements were used, this may not be worthwhile.
31260 if (ExtractedElements != 15)
31263 // Ok, we've now decided to do the transformation.
31264 // If 64-bit shifts are legal, use the extract-shift sequence,
31265 // otherwise bounce the vector off the cache.
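// Rough sketch of the two strategies for a v4i32 source: with legal 64-bit
// shifts, bitcast to v2i64, extract both halves, and recover each i32 with a
// truncate or a shift-right-by-32 plus truncate; otherwise spill the vector
// to a stack temporary and load the four scalars back individually.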
31266 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31269 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
31270 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
31271 auto &DL = DAG.getDataLayout();
31272 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
31273 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
31274 DAG.getConstant(0, dl, VecIdxTy));
31275 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
31276 DAG.getConstant(1, dl, VecIdxTy));
31278 SDValue ShAmt = DAG.getConstant(
31279 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
31280 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
31281 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
31282 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
31283 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
31284 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
31285 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
31287 // Store the value to a temporary stack slot.
31288 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
31289 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
31290 MachinePointerInfo());
31292 EVT ElementType = SrcVT.getVectorElementType();
31293 unsigned EltSize = ElementType.getSizeInBits() / 8;
31295 // Replace each use (extract) with a load of the appropriate element.
31296 for (unsigned i = 0; i < 4; ++i) {
31297 uint64_t Offset = EltSize * i;
31298 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
31299 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
31301 SDValue ScalarAddr =
31302 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
31304 // Load the scalar.
31306 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
31310 // Replace the extracts
31311 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
31312 UE = Uses.end(); UI != UE; ++UI) {
31313 SDNode *Extract = *UI;
31315 uint64_t IdxVal = Extract->getConstantOperandVal(1);
31316 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
31319 // The replacement was made in place; return N so it won't be revisited.
31320 return SDValue(N, 0);
31323 /// If a vector select has an operand that is -1 or 0, try to simplify the
31324 /// select to a bitwise logic operation.
31325 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
31327 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
31328 TargetLowering::DAGCombinerInfo &DCI,
31329 const X86Subtarget &Subtarget) {
31330 SDValue Cond = N->getOperand(0);
31331 SDValue LHS = N->getOperand(1);
31332 SDValue RHS = N->getOperand(2);
31333 EVT VT = LHS.getValueType();
31334 EVT CondVT = Cond.getValueType();
31336 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31338 if (N->getOpcode() != ISD::VSELECT)
31341 assert(CondVT.isVector() && "Vector select expects a vector selector!");
31343 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
31344 // Check if the first operand is all zeros and Cond type is vXi1.
31345 // This situation only applies to avx512.
31346 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
31347 CondVT.getVectorElementType() == MVT::i1) {
31348 // Invert the cond to not(cond) : xor(op,allones)=not(op)
31349 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
31350 DAG.getAllOnesConstant(DL, CondVT));
31351 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
31352 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
31355 // To use the condition operand as a bitwise mask, it must have elements that
31356 // are the same size as the select elements. I.e., the condition operand must
31357 // have already been promoted from the IR select condition type <N x i1>.
31358 // Don't check if the types themselves are equal because that excludes
31359 // vector floating-point selects.
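// For example (illustrative), a v4f32 select needs a v4i32 condition whose
// lanes are 0 or -1; then "vselect C, X, 0" is just (C & X) and
// "vselect C, -1, X" is (C | X), which is what the code below forms.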
31360 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
31363 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
31364 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
31366 // Try to invert the condition if true value is not all 1s and false value is
31367 // not all 0s.
31368 if (!TValIsAllOnes && !FValIsAllZeros &&
31369 // Check if the selector will be produced by CMPP*/PCMP*.
31370 Cond.getOpcode() == ISD::SETCC &&
31371 // Check if SETCC has already been promoted.
31372 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
31374 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
31376 if (TValIsAllZeros || FValIsAllOnes) {
31377 SDValue CC = Cond.getOperand(2);
31378 ISD::CondCode NewCC =
31379 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
31380 Cond.getOperand(0).getValueType().isInteger());
31381 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
31383 std::swap(LHS, RHS);
31384 TValIsAllOnes = FValIsAllOnes;
31385 FValIsAllZeros = TValIsAllZeros;
31389 // Cond value must be 'sign splat' to be converted to a logical op.
31390 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
31393 // vselect Cond, 111..., 000... -> Cond
31394 if (TValIsAllOnes && FValIsAllZeros)
31395 return DAG.getBitcast(VT, Cond);
31397 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
31400 // vselect Cond, 111..., X -> or Cond, X
31401 if (TValIsAllOnes) {
31402 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
31403 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
31404 return DAG.getBitcast(VT, Or);
31407 // vselect Cond, X, 000... -> and Cond, X
31408 if (FValIsAllZeros) {
31409 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
31410 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
31411 return DAG.getBitcast(VT, And);
31414 // vselect Cond, 000..., X -> andn Cond, X
31415 if (TValIsAllZeros) {
31416 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
31417 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
31418 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
31419 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
31420 return DAG.getBitcast(VT, AndN);
31426 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
31427 SDValue Cond = N->getOperand(0);
31428 SDValue LHS = N->getOperand(1);
31429 SDValue RHS = N->getOperand(2);
31432 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
31433 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
31434 if (!TrueC || !FalseC)
31437 // Don't do this for crazy integer types.
31438 EVT VT = N->getValueType(0);
31439 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31442 // We're going to use the condition bit in math or logic ops. We could allow
31443 // this with a wider condition value (post-legalization it becomes an i8),
31444 // but if nothing is creating selects that late, it doesn't matter.
31445 if (Cond.getValueType() != MVT::i1)
31448 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
31449 // 3, 5, or 9 with i32/i64, so those get transformed too.
31450 // TODO: For constants that overflow or do not differ by power-of-2 or small
31451 // multiplier, convert to 'and' + 'add'.
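// For example (illustrative): (select i1 %c, i32 40, i32 8) has AbsDiff = 32,
// so it lowers to roughly ((zext %c) << 5) + 8, while
// (select i1 %c, i32 11, i32 2) has AbsDiff = 9 and folds into a single LEA
// computing (zext %c) * 9 + 2.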
31452 const APInt &TrueVal = TrueC->getAPIntValue();
31453 const APInt &FalseVal = FalseC->getAPIntValue();
31455 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
31459 APInt AbsDiff = Diff.abs();
31460 if (AbsDiff.isPowerOf2() ||
31461 ((VT == MVT::i32 || VT == MVT::i64) &&
31462 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
31464 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
31465 // of the condition can usually be folded into a compare predicate, but even
31466 // without that, the sequence should be cheaper than a CMOV alternative.
31467 if (TrueVal.slt(FalseVal)) {
31468 Cond = DAG.getNOT(DL, Cond, MVT::i1);
31469 std::swap(TrueC, FalseC);
31472 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
31473 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
31475 // Multiply condition by the difference if non-one.
31476 if (!AbsDiff.isOneValue())
31477 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
31479 // Add the base if non-zero.
31480 if (!FalseC->isNullValue())
31481 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
31489 // If this is a bitcasted op that can be represented as another type, push
31490 // the bitcast to the inputs. This allows more opportunities for pattern
31491 // matching masked instructions. This is called when we know that the operation
31492 // is used as one of the inputs of a vselect.
31493 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
31494 TargetLowering::DAGCombinerInfo &DCI) {
31495 // Make sure we have a bitcast.
31496 if (OrigOp.getOpcode() != ISD::BITCAST)
31499 SDValue Op = OrigOp.getOperand(0);
31501 // If the operation is used by anything other than the bitcast, we shouldn't
31502 // do this combine as that would replicate the operation.
31503 if (!Op.hasOneUse())
31506 MVT VT = OrigOp.getSimpleValueType();
31507 MVT EltVT = VT.getVectorElementType();
31508 SDLoc DL(Op.getNode());
31510 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
31512 Op0 = DAG.getBitcast(VT, Op0);
31513 DCI.AddToWorklist(Op0.getNode());
31514 Op1 = DAG.getBitcast(VT, Op1);
31515 DCI.AddToWorklist(Op1.getNode());
31516 DCI.CombineTo(OrigOp.getNode(),
31517 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
31521 unsigned Opcode = Op.getOpcode();
31523 case X86ISD::SHUF128: {
31524 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
31526 // Only change element size, not type.
31527 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
31529 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
31532 case X86ISD::SUBV_BROADCAST: {
31533 unsigned EltSize = EltVT.getSizeInBits();
31534 if (EltSize != 32 && EltSize != 64)
31536 // Only change element size, not type.
31537 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
31539 SDValue Op0 = Op.getOperand(0);
31540 MVT Op0VT = MVT::getVectorVT(EltVT,
31541 Op0.getSimpleValueType().getSizeInBits() / EltSize);
31542 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
31543 DCI.AddToWorklist(Op0.getNode());
31544 DCI.CombineTo(OrigOp.getNode(),
31545 DAG.getNode(Opcode, DL, VT, Op0));
31553 /// Do target-specific dag combines on SELECT and VSELECT nodes.
31554 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
31555 TargetLowering::DAGCombinerInfo &DCI,
31556 const X86Subtarget &Subtarget) {
31558 SDValue Cond = N->getOperand(0);
31559 // Get the LHS/RHS of the select.
31560 SDValue LHS = N->getOperand(1);
31561 SDValue RHS = N->getOperand(2);
31562 EVT VT = LHS.getValueType();
31563 EVT CondVT = Cond.getValueType();
31564 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31566 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
31567 // instructions match the semantics of the common C idiom x<y?x:y but not
31568 // x<=y?x:y, because of how they handle negative zero (which can be
31569 // ignored in unsafe-math mode).
31570 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
31571 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
31572 VT != MVT::f80 && VT != MVT::f128 &&
31573 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
31574 (Subtarget.hasSSE2() ||
31575 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
31576 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31578 unsigned Opcode = 0;
31579 // Check for x CC y ? x : y.
31580 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31581 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31585 // Converting this to a min would handle NaNs incorrectly, and swapping
31586 // the operands would cause it to handle comparisons between positive
31587 // and negative zero incorrectly.
31588 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31589 if (!DAG.getTarget().Options.UnsafeFPMath &&
31590 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31592 std::swap(LHS, RHS);
31594 Opcode = X86ISD::FMIN;
31597 // Converting this to a min would handle comparisons between positive
31598 // and negative zero incorrectly.
31599 if (!DAG.getTarget().Options.UnsafeFPMath &&
31600 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31602 Opcode = X86ISD::FMIN;
31605 // Converting this to a min would handle both negative zeros and NaNs
31606 // incorrectly, but we can swap the operands to fix both.
31607 std::swap(LHS, RHS);
31612 Opcode = X86ISD::FMIN;
31616 // Converting this to a max would handle comparisons between positive
31617 // and negative zero incorrectly.
31618 if (!DAG.getTarget().Options.UnsafeFPMath &&
31619 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31621 Opcode = X86ISD::FMAX;
31624 // Converting this to a max would handle NaNs incorrectly, and swapping
31625 // the operands would cause it to handle comparisons between positive
31626 // and negative zero incorrectly.
31627 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31628 if (!DAG.getTarget().Options.UnsafeFPMath &&
31629 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31631 std::swap(LHS, RHS);
31633 Opcode = X86ISD::FMAX;
31636 // Converting this to a max would handle both negative zeros and NaNs
31637 // incorrectly, but we can swap the operands to fix both.
31638 std::swap(LHS, RHS);
31643 Opcode = X86ISD::FMAX;
31646 // Check for x CC y ? y : x -- a min/max with reversed arms.
31647 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
31648 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
31652 // Converting this to a min would handle comparisons between positive
31653 // and negative zero incorrectly, and swapping the operands would
31654 // cause it to handle NaNs incorrectly.
31655 if (!DAG.getTarget().Options.UnsafeFPMath &&
31656 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
31657 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31659 std::swap(LHS, RHS);
31661 Opcode = X86ISD::FMIN;
31664 // Converting this to a min would handle NaNs incorrectly.
31665 if (!DAG.getTarget().Options.UnsafeFPMath &&
31666 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
31668 Opcode = X86ISD::FMIN;
31671 // Converting this to a min would handle both negative zeros and NaNs
31672 // incorrectly, but we can swap the operands to fix both.
31673 std::swap(LHS, RHS);
31678 Opcode = X86ISD::FMIN;
31682 // Converting this to a max would handle NaNs incorrectly.
31683 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31685 Opcode = X86ISD::FMAX;
31688 // Converting this to a max would handle comparisons between positive
31689 // and negative zero incorrectly, and swapping the operands would
31690 // cause it to handle NaNs incorrectly.
31691 if (!DAG.getTarget().Options.UnsafeFPMath &&
31692 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
31693 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31695 std::swap(LHS, RHS);
31697 Opcode = X86ISD::FMAX;
31700 // Converting this to a max would handle both negative zeros and NaNs
31701 // incorrectly, but we can swap the operands to fix both.
31702 std::swap(LHS, RHS);
31707 Opcode = X86ISD::FMAX;
31713 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
31716 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
31717 // lowering on KNL. In this case we convert it to
31718 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX blend instruction.
31719 // The same situation applies to all 128-bit and 256-bit vectors of i8 and i16.
31720 // Since SKX these selects have a proper lowering.
31721 if (Subtarget.hasAVX512() && CondVT.isVector() &&
31722 CondVT.getVectorElementType() == MVT::i1 &&
31723 (VT.is128BitVector() || VT.is256BitVector()) &&
31724 (VT.getVectorElementType() == MVT::i8 ||
31725 VT.getVectorElementType() == MVT::i16) &&
31726 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
31727 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
31728 DCI.AddToWorklist(Cond.getNode());
31729 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
31732 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
31735 // Canonicalize max and min:
31736 // (x > y) ? x : y -> (x >= y) ? x : y
31737 // (x < y) ? x : y -> (x <= y) ? x : y
31738 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
31739 // the need for an extra compare
31740 // against zero. e.g.
31741 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
31743 // testl %edi, %edi
31745 // cmovgl %edi, %eax
31749 // cmovsl %eax, %edi
31750 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
31751 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31752 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31753 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31758 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
31759 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
31760 Cond.getOperand(0), Cond.getOperand(1), NewCC);
31761 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
31766 // Early exit check
31767 if (!TLI.isTypeLegal(VT))
31770 // Match VSELECTs into subs with unsigned saturation.
31771 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
31772 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
31773 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
31774 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
31775 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31777 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
31778 // left side invert the predicate to simplify logic below.
31780 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
31782 CC = ISD::getSetCCInverse(CC, true);
31783 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
31787 if (Other.getNode() && Other->getNumOperands() == 2 &&
31788 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
31789 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
31790 SDValue CondRHS = Cond->getOperand(1);
31792 // Look for a general sub with unsigned saturation first.
31793 // x >= y ? x-y : 0 --> subus x, y
31794 // x > y ? x-y : 0 --> subus x, y
31795 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
31796 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
31797 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
31799 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
31800 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
31801 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
31802 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
31803 // If the RHS is a constant we have to reverse the const
31804 // canonicalization.
31805 // x > C-1 ? x+-C : 0 --> subus x, C
31806 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
31807 CondRHSConst->getAPIntValue() ==
31808 (-OpRHSConst->getAPIntValue() - 1))
31809 return DAG.getNode(
31810 X86ISD::SUBUS, DL, VT, OpLHS,
31811 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
31813 // Another special case: If C was a sign bit, the sub has been
31814 // canonicalized into a xor.
31815 // FIXME: Would it be better to use computeKnownBits to determine
31816 // whether it's safe to decanonicalize the xor?
31817 // x s< 0 ? x^C : 0 --> subus x, C
31818 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
31819 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
31820 OpRHSConst->getAPIntValue().isSignMask())
31821 // Note that we have to rebuild the RHS constant here to ensure we
31822 // don't rely on particular values of undef lanes.
31823 return DAG.getNode(
31824 X86ISD::SUBUS, DL, VT, OpLHS,
31825 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
31830 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
31833 // If this is a *dynamic* select (non-constant condition) and we can match
31834 // this node with one of the variable blend instructions, restructure the
31835 // condition so that blends can use the high (sign) bit of each element and
31836 // use SimplifyDemandedBits to simplify the condition operand.
31837 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
31838 !DCI.isBeforeLegalize() &&
31839 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
31840 unsigned BitWidth = Cond.getScalarValueSizeInBits();
31842 // Don't optimize vector selects that map to mask-registers.
31846 // We can only handle the cases where VSELECT is directly legal on the
31847 // subtarget. We custom lower VSELECT nodes with constant conditions and
31848 // this makes it hard to see whether a dynamic VSELECT will correctly
31849 // lower, so we both check the operation's status and explicitly handle the
31850 // cases where a *dynamic* blend will fail even though a constant-condition
31851 // blend could be custom lowered.
31852 // FIXME: We should find a better way to handle this class of problems.
31853 // Potentially, we should combine constant-condition vselect nodes
31854 // pre-legalization into shuffles and not mark as many types as custom
31856 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
31858 // FIXME: We don't support i16-element blends currently. We could and
31859 // should support them by making *all* the bits in the condition be set
31860 // rather than just the high bit and using an i8-element blend.
31861 if (VT.getVectorElementType() == MVT::i16)
31863 // Dynamic blending was only available from SSE4.1 onward.
31864 if (VT.is128BitVector() && !Subtarget.hasSSE41())
31866 // Byte blends are only available in AVX2
31867 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
31869 // There are no 512-bit blend instructions that use sign bits.
31870 if (VT.is512BitVector())
31873 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
31874 APInt DemandedMask(APInt::getSignMask(BitWidth));
31876 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
31877 !DCI.isBeforeLegalizeOps());
31878 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
31879 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
31880 // If we changed the computation somewhere in the DAG, this change will
31881 // affect all users of Cond. Make sure it is fine and update all the nodes
31882 // so that we do not use the generic VSELECT anymore. Otherwise, we may
31883 // perform wrong optimizations as we messed with the actual expectation
31884 // for the vector boolean values.
31885 if (Cond != TLO.Old) {
31886 // Check all uses of the condition operand to check whether it will be
31887 // consumed by non-BLEND instructions. Those may require that all bits
31888 // are set properly.
31889 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
31891 // TODO: Add other opcodes eventually lowered into BLEND.
31892 if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
31896 // Update all users of the condition before committing the change, so
31897 // that the VSELECT optimizations that expect the correct vector boolean
31898 // value will not be triggered.
31899 for (SDNode *U : Cond->uses()) {
31900 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
31901 U->getValueType(0), Cond, U->getOperand(1),
31903 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
31905 DCI.CommitTargetLoweringOpt(TLO);
31908 // Only Cond (rather than other nodes in the computation chain) was
31909 // changed. Change the condition just for N to keep the opportunity to
31910 // optimize all other users their own way.
31911 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
31912 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
31917 // Look for vselects with LHS/RHS being bitcasted from an operation that
31918 // can be executed on another type. Push the bitcast to the inputs of
31919 // the operation. This exposes opportunities for using masking instructions.
31920 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
31921 CondVT.getVectorElementType() == MVT::i1) {
31922 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
31923 return SDValue(N, 0);
31924 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
31925 return SDValue(N, 0);
31928 // Custom action for SELECT MMX
31929 if (VT == MVT::x86mmx) {
31930 LHS = DAG.getBitcast(MVT::i64, LHS);
31931 RHS = DAG.getBitcast(MVT::i64, RHS);
31932 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
31933 return DAG.getBitcast(VT, newSelect);
31939 /// Combine:
31940 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
31941 /// to:
31942 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
31943 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
31944 /// Note that this is only legal for some op/cc combinations.
31945 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
31947 const X86Subtarget &Subtarget) {
31948 // This combine only operates on CMP-like nodes.
31949 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31950 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31953 // Can't replace the cmp if it has more uses than the one we're looking at.
31954 // FIXME: We would like to be able to handle this, but would need to make sure
31955 // all uses were updated.
31956 if (!Cmp.hasOneUse())
31959 // This only applies to variations of the common case:
31960 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
31961 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
31962 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
31963 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
31964 // Using the proper condcodes (see below), overflow is checked for.
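// E.g. (illustrative) "if (x.fetch_add(1) < 0)" tests the sign of the old
// value; since old < 0 iff old + 1 <= 0 (with COND_LE accounting for signed
// overflow), the flags of "lock add $1, (x)" can be reused and no separate
// cmp is needed.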
31966 // FIXME: We can generalize both constraints:
31967 // - XOR/OR/AND (if they were made to survive AtomicExpand)
31969 // if the result is compared.
31971 SDValue CmpLHS = Cmp.getOperand(0);
31972 SDValue CmpRHS = Cmp.getOperand(1);
31974 if (!CmpLHS.hasOneUse())
31977 unsigned Opc = CmpLHS.getOpcode();
31978 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
31981 SDValue OpRHS = CmpLHS.getOperand(2);
31982 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
31986 APInt Addend = OpRHSC->getAPIntValue();
31987 if (Opc == ISD::ATOMIC_LOAD_SUB)
31990 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
31994 APInt Comparison = CmpRHSC->getAPIntValue();
31996 // If the addend is the negation of the comparison value, then we can do
31997 // a full comparison by emitting the atomic arithmetic as a locked sub.
31998 if (Comparison == -Addend) {
32001 // The CC is fine, but we need to rewrite the LHS of the comparison as an
32002 // atomic sub.
32001 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
32002 auto AtomicSub = DAG.getAtomic(
32003 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
32004 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
32005 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
32006 AN->getMemOperand());
32007 // If the comparison uses the CF flag we can't use INC/DEC instructions.
32008 bool NeedCF = false;
32011 case X86::COND_A: case X86::COND_AE:
32012 case X86::COND_B: case X86::COND_BE:
32016 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
32017 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
32018 DAG.getUNDEF(CmpLHS.getValueType()));
32019 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
32023 // We can handle comparisons with zero in a number of cases by manipulating
32024 // the CC used.
32025 if (!Comparison.isNullValue())
32028 if (CC == X86::COND_S && Addend == 1)
32030 else if (CC == X86::COND_NS && Addend == 1)
32032 else if (CC == X86::COND_G && Addend == -1)
32034 else if (CC == X86::COND_LE && Addend == -1)
32039 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
32040 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
32041 DAG.getUNDEF(CmpLHS.getValueType()));
32042 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
32046 // Check whether a boolean test is testing a boolean value generated by
32048 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
32049 // flag.
32050 // Simplify the following patterns:
32051 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
32052 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
32053 // to (Op EFLAGS Cond)
32055 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
32056 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
32057 // to (Op EFLAGS !Cond)
32059 // where Op could be BRCOND or CMOV.
32061 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
32062 // This combine only operates on CMP-like nodes.
32063 if (!(Cmp.getOpcode() == X86ISD::CMP ||
32064 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
32067 // Quit if not used as a boolean value.
32068 if (CC != X86::COND_E && CC != X86::COND_NE)
32071 // Check CMP operands. One of them should be 0 or 1 and the other should be
32072 // a SetCC or extended from it.
32073 SDValue Op1 = Cmp.getOperand(0);
32074 SDValue Op2 = Cmp.getOperand(1);
32077 const ConstantSDNode* C = nullptr;
32078 bool needOppositeCond = (CC == X86::COND_E);
32079 bool checkAgainstTrue = false; // Is it a comparison against 1?
32081 if ((C = dyn_cast<ConstantSDNode>(Op1)))
32083 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
32085 else // Quit if all operands are not constants.
32088 if (C->getZExtValue() == 1) {
32089 needOppositeCond = !needOppositeCond;
32090 checkAgainstTrue = true;
32091 } else if (C->getZExtValue() != 0)
32092 // Quit if the constant is neither 0 nor 1.
32095 bool truncatedToBoolWithAnd = false;
32096 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
32097 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
32098 SetCC.getOpcode() == ISD::TRUNCATE ||
32099 SetCC.getOpcode() == ISD::AND) {
32100 if (SetCC.getOpcode() == ISD::AND) {
32102 if (isOneConstant(SetCC.getOperand(0)))
32104 if (isOneConstant(SetCC.getOperand(1)))
32108 SetCC = SetCC.getOperand(OpIdx);
32109 truncatedToBoolWithAnd = true;
32111 SetCC = SetCC.getOperand(0);
32114 switch (SetCC.getOpcode()) {
32115 case X86ISD::SETCC_CARRY:
32116 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
32117 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
32118 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
32119 // truncated to i1 using 'and'.
32120 if (checkAgainstTrue && !truncatedToBoolWithAnd)
32122 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
32123 "Invalid use of SETCC_CARRY!");
32125 case X86ISD::SETCC:
32126 // Set the condition code or opposite one if necessary.
32127 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
32128 if (needOppositeCond)
32129 CC = X86::GetOppositeBranchCondition(CC);
32130 return SetCC.getOperand(1);
32131 case X86ISD::CMOV: {
32132 // Check whether the false/true values are canonical, i.e. 0 or 1.
32133 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
32134 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
32135 // Quit if true value is not a constant.
32138 // Quit if false value is not a constant.
32140 SDValue Op = SetCC.getOperand(0);
32141 // Skip 'zext' or 'trunc' node.
32142 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
32143 Op.getOpcode() == ISD::TRUNCATE)
32144 Op = Op.getOperand(0);
32145 // A special case for rdrand/rdseed, where 0 is set if the false cond is found.
32147 if ((Op.getOpcode() != X86ISD::RDRAND &&
32148 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
32151 // Quit if false value is not the constant 0 or 1.
32152 bool FValIsFalse = true;
32153 if (FVal && FVal->getZExtValue() != 0) {
32154 if (FVal->getZExtValue() != 1)
32156 // If FVal is 1, opposite cond is needed.
32157 needOppositeCond = !needOppositeCond;
32158 FValIsFalse = false;
32160 // Quit if TVal is not the constant opposite of FVal.
32161 if (FValIsFalse && TVal->getZExtValue() != 1)
32163 if (!FValIsFalse && TVal->getZExtValue() != 0)
32165 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
32166 if (needOppositeCond)
32167 CC = X86::GetOppositeBranchCondition(CC);
32168 return SetCC.getOperand(3);
32175 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
32177 /// (X86or (X86setcc) (X86setcc))
32178 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
32179 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
32180 X86::CondCode &CC1, SDValue &Flags,
32182 if (Cond->getOpcode() == X86ISD::CMP) {
32183 if (!isNullConstant(Cond->getOperand(1)))
32186 Cond = Cond->getOperand(0);
32191 SDValue SetCC0, SetCC1;
32192 switch (Cond->getOpcode()) {
32193 default: return false;
32200 SetCC0 = Cond->getOperand(0);
32201 SetCC1 = Cond->getOperand(1);
32205 // Make sure we have SETCC nodes, using the same flags value.
32206 if (SetCC0.getOpcode() != X86ISD::SETCC ||
32207 SetCC1.getOpcode() != X86ISD::SETCC ||
32208 SetCC0->getOperand(1) != SetCC1->getOperand(1))
32211 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
32212 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
32213 Flags = SetCC0->getOperand(1);
32217 // When legalizing carry, we create carries via add X, -1
32218 // If that comes from an actual carry, via setcc, we use the carry directly.
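// For example, if the carry operand is X = (setcc COND_B, Flags), X is 0 or 1
// and (add X, -1) produces CF == X, so the original Flags can be used in place
// of re-materializing the carry.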
32220 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
32221 if (EFLAGS.getOpcode() == X86ISD::ADD) {
32222 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
32223 SDValue Carry = EFLAGS.getOperand(0);
32224 while (Carry.getOpcode() == ISD::TRUNCATE ||
32225 Carry.getOpcode() == ISD::ZERO_EXTEND ||
32226 Carry.getOpcode() == ISD::SIGN_EXTEND ||
32227 Carry.getOpcode() == ISD::ANY_EXTEND ||
32228 (Carry.getOpcode() == ISD::AND &&
32229 isOneConstant(Carry.getOperand(1))))
32230 Carry = Carry.getOperand(0);
32231 if (Carry.getOpcode() == X86ISD::SETCC ||
32232 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
32233 if (Carry.getConstantOperandVal(0) == X86::COND_B)
32234 return Carry.getOperand(1);
32242 /// Optimize an EFLAGS definition used according to the condition code \p CC
32243 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
32244 /// uses of chain values.
32245 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
32247 const X86Subtarget &Subtarget) {
32248 if (CC == X86::COND_B)
32249 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
32252 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
32254 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
32257 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
32258 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
32259 TargetLowering::DAGCombinerInfo &DCI,
32260 const X86Subtarget &Subtarget) {
32263 SDValue FalseOp = N->getOperand(0);
32264 SDValue TrueOp = N->getOperand(1);
32265 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
32266 SDValue Cond = N->getOperand(3);
32268 if (CC == X86::COND_E || CC == X86::COND_NE) {
32269 switch (Cond.getOpcode()) {
32273 // If the operand of BSR/BSF is proven never zero, then ZF cannot be set.
32274 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
32275 return (CC == X86::COND_E) ? FalseOp : TrueOp;
32279 // Try to simplify the EFLAGS and condition code operands.
32280 // We can't always do this as FCMOV only supports a subset of X86 cond.
32281 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
32282 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
32283 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
32285 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32289 // If this is a select between two integer constants, try to do some
32290 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
32292 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
32293 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
32294 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
32295 // larger than FalseC (the false value).
32296 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
32297 CC = X86::GetOppositeBranchCondition(CC);
32298 std::swap(TrueC, FalseC);
32299 std::swap(TrueOp, FalseOp);
32302 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
32303 // This is efficient for any integer data type (including i8/i16) and shift amount.
32305 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
32306 Cond = getSETCC(CC, Cond, DL, DAG);
32308 // Zero extend the condition if needed.
32309 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
32311 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
32312 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
32313 DAG.getConstant(ShAmt, DL, MVT::i8));
32317 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
32318 // for any integer data type, including i8/i16.
32319 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
32320 Cond = getSETCC(CC, Cond, DL, DAG);
32322 // Zero extend the condition if needed.
32323 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
32324 FalseC->getValueType(0), Cond);
32325 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
32326 SDValue(FalseC, 0));
32330 // Optimize cases that will turn into an LEA instruction. This requires
32331 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
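// For example, selecting between the constants 13 (true) and 10 (false) has
// Diff == 3, so the result is computed as 10 + zext(setcc) * 3, and the
// multiply-by-3 plus the add fold into a single LEA.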
32332 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
32333 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
32334 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
32336 bool isFastMultiplier = false;
32338 switch ((unsigned char)Diff) {
32340 case 1: // result = add base, cond
32341 case 2: // result = lea base( , cond*2)
32342 case 3: // result = lea base(cond, cond*2)
32343 case 4: // result = lea base( , cond*4)
32344 case 5: // result = lea base(cond, cond*4)
32345 case 8: // result = lea base( , cond*8)
32346 case 9: // result = lea base(cond, cond*8)
32347 isFastMultiplier = true;
32352 if (isFastMultiplier) {
32353 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
32354 Cond = getSETCC(CC, Cond, DL ,DAG);
32355 // Zero extend the condition if needed.
32356 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
32358 // Scale the condition by the difference.
32360 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
32361 DAG.getConstant(Diff, DL, Cond.getValueType()));
32363 // Add the base if non-zero.
32364 if (FalseC->getAPIntValue() != 0)
32365 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
32366 SDValue(FalseC, 0));
32373 // Handle these cases:
32374 // (select (x != c), e, c) -> select (x != c), e, x),
32375 // (select (x == c), c, e) -> select (x == c), x, e)
32376 // where the c is an integer constant, and the "select" is the combination
32377 // of CMOV and CMP.
32379 // The rationale for this change is that the conditional-move from a constant
32380 // needs two instructions, however, conditional-move from a register needs
32381 // only one instruction.
32383 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
32384 // some instruction-combining opportunities. This opt needs to be
32385 // postponed as late as possible.
32387 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
32388 // the DCI.xxxx conditions are provided to postpone the optimization as
32389 // late as possible.
32391 ConstantSDNode *CmpAgainst = nullptr;
32392 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
32393 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
32394 !isa<ConstantSDNode>(Cond.getOperand(0))) {
32396 if (CC == X86::COND_NE &&
32397 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
32398 CC = X86::GetOppositeBranchCondition(CC);
32399 std::swap(TrueOp, FalseOp);
32402 if (CC == X86::COND_E &&
32403 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
32404 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
32405 DAG.getConstant(CC, DL, MVT::i8), Cond };
32406 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32411 // Fold and/or of setcc's to double CMOV:
32412 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
32413 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
32415 // This combine lets us generate:
32416 // cmovcc1 (jcc1 if we don't have CMOV)
32422 // cmovne (jne if we don't have CMOV)
32423 // When we can't use the CMOV instruction, it might increase branch mispredicts.
32425 // When we can use CMOV, or when there is no mispredict, this improves
32426 // throughput and reduces register pressure.
32428 if (CC == X86::COND_NE) {
32430 X86::CondCode CC0, CC1;
32432 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
32434 std::swap(FalseOp, TrueOp);
32435 CC0 = X86::GetOppositeBranchCondition(CC0);
32436 CC1 = X86::GetOppositeBranchCondition(CC1);
32439 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
32441 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
32442 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
32443 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32451 /// Different mul shrinking modes.
32452 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
32454 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
32455 EVT VT = N->getOperand(0).getValueType();
32456 if (VT.getScalarSizeInBits() != 32)
32459 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
32460 unsigned SignBits[2] = {1, 1};
32461 bool IsPositive[2] = {false, false};
32462 for (unsigned i = 0; i < 2; i++) {
32463 SDValue Opd = N->getOperand(i);
32465 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
32466 // compute signbits for it separately.
32467 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
32468 // For anyextend, it is safe to assume an appropriate number of leading sign/zero bits.
32470 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
32472 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
32477 IsPositive[i] = true;
32478 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
32479 // All the operands of BUILD_VECTOR need to be integer constants.
32480 // Find the smallest value range which all the operands belong to.
32482 IsPositive[i] = true;
32483 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
32484 if (SubOp.isUndef())
32486 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
32489 APInt IntVal = CN->getAPIntValue();
32490 if (IntVal.isNegative())
32491 IsPositive[i] = false;
32492 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
32495 SignBits[i] = DAG.ComputeNumSignBits(Opd);
32496 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
32497 IsPositive[i] = true;
32501 bool AllPositive = IsPositive[0] && IsPositive[1];
32502 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
32503 // When ranges are from -128 ~ 127, use MULS8 mode.
32504 if (MinSignBits >= 25)
32506 // When ranges are from 0 ~ 255, use MULU8 mode.
32507 else if (AllPositive && MinSignBits >= 24)
32509 // When ranges are from -32768 ~ 32767, use MULS16 mode.
32510 else if (MinSignBits >= 17)
32512 // When ranges are from 0 ~ 65535, use MULU16 mode.
32513 else if (AllPositive && MinSignBits >= 16)
32520 /// When the operands of vector mul are extended from smaller size values,
32521 /// like i8 and i16, the type of mul may be shrunk to generate more
32522 /// efficient code. Two typical patterns are handled:
32524 /// %2 = sext/zext <N x i8> %1 to <N x i32>
32525 /// %4 = sext/zext <N x i8> %3 to <N x i32>
32526 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32527 /// %5 = mul <N x i32> %2, %4
32530 /// %2 = zext/sext <N x i16> %1 to <N x i32>
32531 /// %4 = zext/sext <N x i16> %3 to <N x i32>
32532 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32533 /// %5 = mul <N x i32> %2, %4
32535 /// There are four mul shrinking modes:
32536 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
32537 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
32538 /// generate pmullw+sext32 for it (MULS8 mode).
32539 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
32540 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
32541 /// generate pmullw+zext32 for it (MULU8 mode).
32542 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
32543 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
32544 /// generate pmullw+pmulhw for it (MULS16 mode).
32545 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
32546 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
32547 /// generate pmullw+pmulhuw for it (MULU16 mode).
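/// For example, in MULU16 mode with <8 x i32> operands whose elements all fit
/// in 16 bits: both operands are truncated to <8 x i16>, pmullw produces the
/// low 16 bits of each product and pmulhuw the high 16 bits, and the two
/// results are interleaved (punpcklwd/punpckhwd) to rebuild the <8 x i32>
/// products.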
32548 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
32549 const X86Subtarget &Subtarget) {
32550 // Check for legality
32551 // pmullw/pmulhw are not supported without SSE2.
32552 if (!Subtarget.hasSSE2())
32555 // Check for profitability
32556 // pmulld has been available since SSE4.1. It is better to use pmulld instead of
32557 // pmullw+pmulhw, except on subtargets where pmulld is the slower of the two.
32559 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
32560 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
32564 if (!canReduceVMulWidth(N, DAG, Mode))
32568 SDValue N0 = N->getOperand(0);
32569 SDValue N1 = N->getOperand(1);
32570 EVT VT = N->getOperand(0).getValueType();
32571 unsigned NumElts = VT.getVectorNumElements();
32572 if ((NumElts % 2) != 0)
32575 // If the upper 17 bits of each element are zero then we can use PMADD.
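// With the upper 17 bits clear, each 32-bit element viewed as two i16 halves
// has a zero high half and a non-negative low half, so vpmaddwd's pairwise
// a0*b0 + a1*b1 reduces to the single product a0*b0, which fits in 32 bits.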
32576 APInt Mask17 = APInt::getHighBitsSet(32, 17);
32577 if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
32578 DAG.MaskedValueIsZero(N1, Mask17))
32579 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
32580 DAG.getBitcast(MVT::v8i16, N1));
32582 unsigned RegSize = 128;
32583 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
32584 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
32586 // Shrink the operands of mul.
32587 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
32588 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
32590 if (NumElts >= OpsVT.getVectorNumElements()) {
32591 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
32592 // lower part is needed.
32593 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
32594 if (Mode == MULU8 || Mode == MULS8) {
32595 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
32598 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
32599 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
32600 // the higher part is also needed.
32601 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32602 ReducedVT, NewN0, NewN1);
32604 // Repack the lower part and higher part result of mul into a wider result.
32606 // Generate shuffle functioning as punpcklwd.
32607 SmallVector<int, 16> ShuffleMask(NumElts);
32608 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32609 ShuffleMask[2 * i] = i;
32610 ShuffleMask[2 * i + 1] = i + NumElts;
32613 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32614 ResLo = DAG.getBitcast(ResVT, ResLo);
32615 // Generate shuffle functioning as punpckhwd.
32616 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32617 ShuffleMask[2 * i] = i + NumElts / 2;
32618 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
32621 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32622 ResHi = DAG.getBitcast(ResVT, ResHi);
32623 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
32626 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
32627 // to legalize the mul explicitly because implicit legalization for type
32628 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
32629 // instructions which will not exist when we explicitly legalize it by
32630 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
32631 // <4 x i16> undef).
32633 // Legalize the operands of mul.
32634 // FIXME: We may be able to handle non-concatenated vectors by insertion.
32635 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
32636 if ((RegSize % ReducedSizeInBits) != 0)
32639 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
32640 DAG.getUNDEF(ReducedVT));
32642 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
32644 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
32646 if (Mode == MULU8 || Mode == MULS8) {
32647 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the lower part is needed.
32649 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32651 // convert the type of mul result to VT.
32652 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32653 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
32654 : ISD::SIGN_EXTEND_VECTOR_INREG,
32656 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32657 DAG.getIntPtrConstant(0, DL));
32659 // Generate the lower and higher parts of the mul: pmullw and pmulhw/pmulhuw. For
32660 // MULU16/MULS16, both parts are needed.
32661 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32662 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32663 OpsVT, NewN0, NewN1);
32665 // Repack the lower part and higher part result of mul into a wider
32666 // result. Make sure the type of mul result is VT.
32667 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32668 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
32669 Res = DAG.getBitcast(ResVT, Res);
32670 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32671 DAG.getIntPtrConstant(0, DL));
32676 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
32677 EVT VT, SDLoc DL) {
32679 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
32680 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32681 DAG.getConstant(Mult, DL, VT));
32682 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
32683 DAG.getConstant(Shift, DL, MVT::i8));
32684 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
32689 auto combineMulMulAddOrSub = [&](bool isAdd) {
32690 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32691 DAG.getConstant(9, DL, VT));
32692 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
32693 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
32702 // mul x, 11 => add ((shl (mul x, 5), 1), x)
32703 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
32705 // mul x, 21 => add ((shl (mul x, 5), 2), x)
32706 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
32708 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
32709 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32710 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
32712 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
32713 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
32715 // mul x, 13 => add ((shl (mul x, 3), 2), x)
32716 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
32718 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
32719 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
32721 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
32722 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32723 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
32725 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
32726 return combineMulMulAddOrSub(/*isAdd*/ false);
32728 // mul x, 28 => add ((mul (mul x, 9), 3), x)
32729 return combineMulMulAddOrSub(/*isAdd*/ true);
32731 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
32732 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32733 combineMulMulAddOrSub(/*isAdd*/ true));
32735 // mul x, 30 => sub (sub ((shl x, 5), x), x)
32736 return DAG.getNode(
32738 DAG.getNode(ISD::SUB, DL, VT,
32739 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32740 DAG.getConstant(5, DL, MVT::i8)),
32747 /// Optimize a single multiply with constant into two operations in order to
32748 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
32749 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
32750 TargetLowering::DAGCombinerInfo &DCI,
32751 const X86Subtarget &Subtarget) {
32752 EVT VT = N->getValueType(0);
32753 if (DCI.isBeforeLegalize() && VT.isVector())
32754 return reduceVMULWidth(N, DAG, Subtarget);
32756 if (!MulConstantOptimization)
32758 // An imul is usually smaller than the alternative sequence.
32759 if (DAG.getMachineFunction().getFunction().optForMinSize())
32762 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
32765 if (VT != MVT::i64 && VT != MVT::i32)
32768 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
32771 uint64_t MulAmt = C->getZExtValue();
32772 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
32775 uint64_t MulAmt1 = 0;
32776 uint64_t MulAmt2 = 0;
32777 if ((MulAmt % 9) == 0) {
32779 MulAmt2 = MulAmt / 9;
32780 } else if ((MulAmt % 5) == 0) {
32782 MulAmt2 = MulAmt / 5;
32783 } else if ((MulAmt % 3) == 0) {
32785 MulAmt2 = MulAmt / 3;
32791 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
32793 if (isPowerOf2_64(MulAmt2) &&
32794 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
32795 // If the second multiplier is pow2, issue it first. We want the multiply by
32796 // 3, 5, or 9 to be folded into the addressing mode unless the lone use is an add.
32798 std::swap(MulAmt1, MulAmt2);
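// For example, mul x, 24 (== 3 * 8): 8 is a power of two, so when the lone
// use is not an add we emit (shl x, 3) first and then the multiply-by-3,
// which selects as a single LEA.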
32800 if (isPowerOf2_64(MulAmt1))
32801 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32802 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
32804 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32805 DAG.getConstant(MulAmt1, DL, VT));
32807 if (isPowerOf2_64(MulAmt2))
32808 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
32809 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
32811 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
32812 DAG.getConstant(MulAmt2, DL, VT));
32813 } else if (!Subtarget.slowLEA())
32814 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
32817 assert(MulAmt != 0 &&
32818 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
32819 "Both cases that could cause potential overflows should have "
32820 "already been handled.");
32821 int64_t SignMulAmt = C->getSExtValue();
32822 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
32823 (SignMulAmt != -INT64_MAX)) {
32824 int NumSign = SignMulAmt > 0 ? 1 : -1;
32825 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
32826 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
32827 if (IsPowerOf2_64PlusOne) {
32828 // (mul x, 2^N + 1) => (add (shl x, N), x)
32829 NewMul = DAG.getNode(
32830 ISD::ADD, DL, VT, N->getOperand(0),
32831 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32832 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
32834 } else if (IsPowerOf2_64MinusOne) {
32835 // (mul x, 2^N - 1) => (sub (shl x, N), x)
32836 NewMul = DAG.getNode(
32838 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32839 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
32843 // To negate, subtract the number from zero
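// For example, mul x, -9: |-9| == 2^3 + 1, so NewMul is (add (shl x, 3), x)
// and the final result is (sub 0, NewMul).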
32844 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
32846 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
32851 // Do not add new nodes to DAG combiner worklist.
32852 DCI.CombineTo(N, NewMul, false);
32857 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
32858 SDValue N0 = N->getOperand(0);
32859 SDValue N1 = N->getOperand(1);
32860 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
32861 EVT VT = N0.getValueType();
32863 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
32864 // since the result of setcc_c is all zero's or all ones.
32865 if (VT.isInteger() && !VT.isVector() &&
32866 N1C && N0.getOpcode() == ISD::AND &&
32867 N0.getOperand(1).getOpcode() == ISD::Constant) {
32868 SDValue N00 = N0.getOperand(0);
32869 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
32870 Mask <<= N1C->getAPIntValue();
32871 bool MaskOK = false;
32872 // We can handle cases concerning bit-widening nodes containing setcc_c if
32873 // we carefully interrogate the mask to make sure we are semantics preserving.
32875 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
32876 // of the underlying setcc_c operation if the setcc_c was zero extended.
32877 // Consider the following example:
32878 // zext(setcc_c) -> i32 0x0000FFFF
32879 // c1 -> i32 0x0000FFFF
32880 // c2 -> i32 0x00000001
32881 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
32882 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
32883 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
32885 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
32886 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
32888 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
32889 N00.getOpcode() == ISD::ANY_EXTEND) &&
32890 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
32891 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
32893 if (MaskOK && Mask != 0) {
32895 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
32899 // Hardware support for vector shifts is sparse which makes us scalarize the
32900 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than a shift.
32902 // (shl V, 1) -> add V,V
32903 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
32904 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
32905 assert(N0.getValueType().isVector() && "Invalid vector shift type");
32906 // We shift all of the values by one. In many cases we do not have
32907 // hardware support for this operation. This is better expressed as an ADD
32909 if (N1SplatC->getAPIntValue() == 1)
32910 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
32916 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
32917 SDValue N0 = N->getOperand(0);
32918 SDValue N1 = N->getOperand(1);
32919 EVT VT = N0.getValueType();
32920 unsigned Size = VT.getSizeInBits();
32922 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
32923 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
32924 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
32925 // depending on sign of (SarConst - [56,48,32,24,16])
32927 // sexts in X86 are MOVs. The MOVs have the same code size
32928 // as the above SHIFTs (only a SHIFT by 1 has lower code size).
32929 // However the MOVs have 2 advantages to a SHIFT:
32930 // 1. MOVs can write to a register that differs from source
32931 // 2. MOVs accept memory operands
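// For example, on i64: (sra (shl x, 56), 60) becomes
// (sra (sext_in_reg x, i8), 4); the in-register sign extension selects as a
// movsx and the remaining arithmetic shift is only by 4.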
32933 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
32934 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
32935 N0.getOperand(1).getOpcode() != ISD::Constant)
32938 SDValue N00 = N0.getOperand(0);
32939 SDValue N01 = N0.getOperand(1);
32940 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
32941 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
32942 EVT CVT = N1.getValueType();
32944 if (SarConst.isNegative())
32947 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
32948 unsigned ShiftSize = SVT.getSizeInBits();
32949 // Skip types without a corresponding sext/zext and ShlConst values
32950 // that are not one of [56,48,32,24,16].
32951 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
32955 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
32956 SarConst = SarConst - (Size - ShiftSize);
32959 else if (SarConst.isNegative())
32960 return DAG.getNode(ISD::SHL, DL, VT, NN,
32961 DAG.getConstant(-SarConst, DL, CVT));
32963 return DAG.getNode(ISD::SRA, DL, VT, NN,
32964 DAG.getConstant(SarConst, DL, CVT));
32969 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
32970 SDValue N0 = N->getOperand(0);
32971 SDValue N1 = N->getOperand(1);
32972 EVT VT = N0.getValueType();
32974 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
32975 // TODO: This is a generic DAG combine that became an x86-only combine to
32976 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
32977 // and-not ('andn').
32978 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
32981 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
32982 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32983 if (!ShiftC || !AndC)
32986 // If we can shrink the constant mask below 8-bits or 32-bits, then this
32987 // transform should reduce code size. It may also enable secondary transforms
32988 // from improved known-bits analysis or instruction selection.
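// For example, (srl (and X, 0x3F80), 7) becomes (and (srl X, 7), 0x7F); the
// new mask fits in a sign-extended 8-bit immediate, so the AND encodes more
// compactly.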
32989 APInt MaskVal = AndC->getAPIntValue();
32990 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
32991 unsigned OldMaskSize = MaskVal.getMinSignedBits();
32992 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
32993 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
32994 (OldMaskSize > 32 && NewMaskSize <= 32)) {
32995 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
32997 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
32998 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
32999 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
33004 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
33005 TargetLowering::DAGCombinerInfo &DCI,
33006 const X86Subtarget &Subtarget) {
33007 if (N->getOpcode() == ISD::SHL)
33008 if (SDValue V = combineShiftLeft(N, DAG))
33011 if (N->getOpcode() == ISD::SRA)
33012 if (SDValue V = combineShiftRightArithmetic(N, DAG))
33015 if (N->getOpcode() == ISD::SRL)
33016 if (SDValue V = combineShiftRightLogical(N, DAG))
33022 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
33023 TargetLowering::DAGCombinerInfo &DCI,
33024 const X86Subtarget &Subtarget) {
33025 unsigned Opcode = N->getOpcode();
33026 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
33027 "Unexpected shift opcode");
33029 EVT VT = N->getValueType(0);
33030 SDValue N0 = N->getOperand(0);
33031 SDValue N1 = N->getOperand(1);
33032 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
33033 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
33034 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
33035 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
33036 "Unexpected PACKSS/PACKUS input type");
33038 // Constant Folding.
33039 APInt UndefElts0, UndefElts1;
33040 SmallVector<APInt, 32> EltBits0, EltBits1;
33041 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
33042 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
33043 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
33044 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
33045 unsigned NumLanes = VT.getSizeInBits() / 128;
33046 unsigned NumDstElts = VT.getVectorNumElements();
33047 unsigned NumSrcElts = NumDstElts / 2;
33048 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
33049 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
33050 bool IsSigned = (X86ISD::PACKSS == Opcode);
33052 APInt Undefs(NumDstElts, 0);
33053 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
33054 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
33055 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
33056 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
33057 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
33058 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
33060 if (UndefElts[SrcIdx]) {
33061 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
33065 APInt &Val = EltBits[SrcIdx];
33067 // PACKSS: Truncate signed value with signed saturation.
33068 // Source values less than dst minint are saturated to minint.
33069 // Source values greater than dst maxint are saturated to maxint.
33070 if (Val.isSignedIntN(DstBitsPerElt))
33071 Val = Val.trunc(DstBitsPerElt);
33072 else if (Val.isNegative())
33073 Val = APInt::getSignedMinValue(DstBitsPerElt);
33075 Val = APInt::getSignedMaxValue(DstBitsPerElt);
33077 // PACKUS: Truncate signed value with unsigned saturation.
33078 // Source values less than zero are saturated to zero.
33079 // Source values greater than dst maxuint are saturated to maxuint.
33080 if (Val.isIntN(DstBitsPerElt))
33081 Val = Val.trunc(DstBitsPerElt);
33082 else if (Val.isNegative())
33083 Val = APInt::getNullValue(DstBitsPerElt);
33085 Val = APInt::getAllOnesValue(DstBitsPerElt);
33087 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
33091 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
33094 // Attempt to combine as shuffle.
33096 if (SDValue Res = combineX86ShufflesRecursively(
33097 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33098 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33099 DCI.CombineTo(N, Res);
33106 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
33107 TargetLowering::DAGCombinerInfo &DCI,
33108 const X86Subtarget &Subtarget) {
33109 unsigned Opcode = N->getOpcode();
33110 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
33111 X86ISD::VSRLI == Opcode) &&
33112 "Unexpected shift opcode");
33113 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
33114 EVT VT = N->getValueType(0);
33115 SDValue N0 = N->getOperand(0);
33116 SDValue N1 = N->getOperand(1);
33117 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
33118 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
33119 "Unexpected value type");
33121 // Out of range logical bit shifts are guaranteed to be zero.
33122 // Out of range arithmetic bit shifts splat the sign bit.
33123 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
33124 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
33126 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
33128 ShiftVal = NumBitsPerElt - 1;
33131 // Shift N0 by zero -> N0.
33135 // Shift zero -> zero.
33136 if (ISD::isBuildVectorAllZeros(N0.getNode()))
33137 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
33139 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
33140 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
33141 // TODO - support other sra opcodes as needed.
33142 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
33143 N0.getOpcode() == X86ISD::VSRAI)
33144 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
33146 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
33147 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
33148 N1 == N0.getOperand(1)) {
33149 SDValue N00 = N0.getOperand(0);
33150 unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
33151 if (ShiftVal.ult(NumSignBits))
33155 // We can decode 'whole byte' logical bit shifts as shuffles.
33156 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
33158 if (SDValue Res = combineX86ShufflesRecursively(
33159 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33160 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33161 DCI.CombineTo(N, Res);
33166 // Constant Folding.
33168 SmallVector<APInt, 32> EltBits;
33169 if (N->isOnlyUserOf(N0.getNode()) &&
33170 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
33171 assert(EltBits.size() == VT.getVectorNumElements() &&
33172 "Unexpected shift value type");
33173 unsigned ShiftImm = ShiftVal.getZExtValue();
33174 for (APInt &Elt : EltBits) {
33175 if (X86ISD::VSHLI == Opcode)
33177 else if (X86ISD::VSRAI == Opcode)
33178 Elt.ashrInPlace(ShiftImm);
33180 Elt.lshrInPlace(ShiftImm);
33182 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
33188 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
33189 TargetLowering::DAGCombinerInfo &DCI,
33190 const X86Subtarget &Subtarget) {
33192 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
33193 (N->getOpcode() == X86ISD::PINSRW &&
33194 N->getValueType(0) == MVT::v8i16)) &&
33195 "Unexpected vector insertion");
33197 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
33199 if (SDValue Res = combineX86ShufflesRecursively(
33200 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33201 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33202 DCI.CombineTo(N, Res);
33209 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
33210 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
33211 /// OR -> CMPNEQSS.
33212 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
33213 TargetLowering::DAGCombinerInfo &DCI,
33214 const X86Subtarget &Subtarget) {
33217 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
33218 // we're requiring SSE2 for both.
33219 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
33220 SDValue N0 = N->getOperand(0);
33221 SDValue N1 = N->getOperand(1);
33222 SDValue CMP0 = N0->getOperand(1);
33223 SDValue CMP1 = N1->getOperand(1);
33226 // The SETCCs should both refer to the same CMP.
33227 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
33230 SDValue CMP00 = CMP0->getOperand(0);
33231 SDValue CMP01 = CMP0->getOperand(1);
33232 EVT VT = CMP00.getValueType();
33234 if (VT == MVT::f32 || VT == MVT::f64) {
33235 bool ExpectingFlags = false;
33236 // Check for any users that want flags:
33237 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
33238 !ExpectingFlags && UI != UE; ++UI)
33239 switch (UI->getOpcode()) {
33244 ExpectingFlags = true;
33246 case ISD::CopyToReg:
33247 case ISD::SIGN_EXTEND:
33248 case ISD::ZERO_EXTEND:
33249 case ISD::ANY_EXTEND:
33253 if (!ExpectingFlags) {
33254 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
33255 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
33257 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
33258 X86::CondCode tmp = cc0;
33263 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
33264 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
33265 // FIXME: need symbolic constants for these magic numbers.
33266 // See X86ATTInstPrinter.cpp:printSSECC().
33267 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
33268 if (Subtarget.hasAVX512()) {
33270 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
33271 DAG.getConstant(x86cc, DL, MVT::i8));
33272 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
33273 N->getSimpleValueType(0), FSetCC,
33274 DAG.getIntPtrConstant(0, DL));
33276 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
33277 CMP00.getValueType(), CMP00, CMP01,
33278 DAG.getConstant(x86cc, DL,
33281 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
33282 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
33284 if (is64BitFP && !Subtarget.is64Bit()) {
33285 // On a 32-bit target, we cannot bitcast the 64-bit float to a
33286 // 64-bit integer, since that's not a legal type. Since
33287 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
33288 // bits, but can do this little dance to extract the lowest 32 bits
33289 // and work with those going forward.
33290 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
33292 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
33293 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
33294 Vector32, DAG.getIntPtrConstant(0, DL));
33298 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
33299 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
33300 DAG.getConstant(1, DL, IntVT));
33301 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
33303 return OneBitOfTruth;
33311 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
33312 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
33313 assert(N->getOpcode() == ISD::AND);
33315 EVT VT = N->getValueType(0);
33316 SDValue N0 = N->getOperand(0);
33317 SDValue N1 = N->getOperand(1);
33320 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
33323 if (N0.getOpcode() == ISD::XOR &&
33324 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
33325 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
33327 if (N1.getOpcode() == ISD::XOR &&
33328 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
33329 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
33334 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
33335 // register. In most cases we actually compare or select YMM-sized registers
33336 // and mixing the two types creates horrible code. This method optimizes
33337 // some of the transition sequences.
33338 // Even with AVX-512 this is still useful for removing casts around logical
33339 // operations on vXi1 mask types.
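// For example, (zext (and (trunc A), (trunc B))), where A and B are the
// original wide vectors, can instead AND A and B directly and then clear the
// bits above the narrow element width, avoiding the narrow/widen round trip.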
33340 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
33341 TargetLowering::DAGCombinerInfo &DCI,
33342 const X86Subtarget &Subtarget) {
33343 EVT VT = N->getValueType(0);
33344 assert(VT.isVector() && "Expected vector type");
33346 assert((N->getOpcode() == ISD::ANY_EXTEND ||
33347 N->getOpcode() == ISD::ZERO_EXTEND ||
33348 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
33350 SDValue Narrow = N->getOperand(0);
33351 EVT NarrowVT = Narrow.getValueType();
33353 if (Narrow->getOpcode() != ISD::XOR &&
33354 Narrow->getOpcode() != ISD::AND &&
33355 Narrow->getOpcode() != ISD::OR)
33358 SDValue N0 = Narrow->getOperand(0);
33359 SDValue N1 = Narrow->getOperand(1);
33362 // The Left side has to be a trunc.
33363 if (N0.getOpcode() != ISD::TRUNCATE)
33366 // The type of the truncated inputs.
33367 if (N0->getOperand(0).getValueType() != VT)
33370 // The right side has to be a 'trunc' or a constant vector.
33371 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
33372 N1.getOperand(0).getValueType() == VT;
33374 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
33377 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33379 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
33382 // Set N0 and N1 to hold the inputs to the new wide operation.
33383 N0 = N0->getOperand(0);
33385 N1 = N1->getOperand(0);
33387 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
33389 // Generate the wide operation.
33390 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
33391 unsigned Opcode = N->getOpcode();
33393 default: llvm_unreachable("Unexpected opcode");
33394 case ISD::ANY_EXTEND:
33396 case ISD::ZERO_EXTEND:
33397 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
33398 case ISD::SIGN_EXTEND:
33399 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
33400 Op, DAG.getValueType(NarrowVT));
33404 /// If both input operands of a logic op are being cast from floating point
33405 /// types, try to convert this into a floating point logic node to avoid
33406 /// unnecessary moves from SSE to integer registers.
33407 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
33408 const X86Subtarget &Subtarget) {
33409 unsigned FPOpcode = ISD::DELETED_NODE;
33410 if (N->getOpcode() == ISD::AND)
33411 FPOpcode = X86ISD::FAND;
33412 else if (N->getOpcode() == ISD::OR)
33413 FPOpcode = X86ISD::FOR;
33414 else if (N->getOpcode() == ISD::XOR)
33415 FPOpcode = X86ISD::FXOR;
33417 assert(FPOpcode != ISD::DELETED_NODE &&
33418 "Unexpected input node for FP logic conversion");
33420 EVT VT = N->getValueType(0);
33421 SDValue N0 = N->getOperand(0);
33422 SDValue N1 = N->getOperand(1);
33424 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
33425 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
33426 (Subtarget.hasSSE2() && VT == MVT::i64))) {
33427 SDValue N00 = N0.getOperand(0);
33428 SDValue N10 = N1.getOperand(0);
33429 EVT N00Type = N00.getValueType();
33430 EVT N10Type = N10.getValueType();
33431 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
33432 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
33433 return DAG.getBitcast(VT, FPLogic);
33439 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
33440 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
33441 /// with a shift-right to eliminate loading the vector constant mask value.
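/// For example, for v4i32: (and (pcmpgtd a, b), (splat 1)) has lanes that are
/// all-ones or all-zeros, so it is equivalent to (psrld (pcmpgtd a, b), 31)
/// and no constant-pool mask load is needed.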
33442 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
33443 const X86Subtarget &Subtarget) {
33444 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
33445 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
33446 EVT VT0 = Op0.getValueType();
33447 EVT VT1 = Op1.getValueType();
33449 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
33453 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
33454 !SplatVal.isMask())
33457 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
33460 unsigned EltBitWidth = VT0.getScalarSizeInBits();
33461 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
33465 unsigned ShiftVal = SplatVal.countTrailingOnes();
33466 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
33467 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
33468 return DAG.getBitcast(N->getValueType(0), Shift);
33471 // Get the index node from the lowered DAG of a GEP IR instruction with one
33472 // indexing dimension.
33473 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
33474 if (Ld->isIndexed())
33477 SDValue Base = Ld->getBasePtr();
33479 if (Base.getOpcode() != ISD::ADD)
33482 SDValue ShiftedIndex = Base.getOperand(0);
33484 if (ShiftedIndex.getOpcode() != ISD::SHL)
33487 return ShiftedIndex.getOperand(0);
33491 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
33492 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
33493 switch (VT.getSizeInBits()) {
33494 default: return false;
33495 case 64: return Subtarget.is64Bit();
33496 case 32: return true;
33502 // This function recognizes cases where the X86 BZHI instruction can replace an
33503 // 'and-load' sequence.
33504 // When an integer value is loaded from an array of constants defined as
33507 //   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
33509 // and a bitwise AND is then applied to the loaded value and another input,
33510 // the result is equivalent to performing BZHI (zero high bits) on that input,
33511 // using the same index as the load.
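// For example, C code along these lines (hypothetical names, 32-bit case):
//   static const unsigned mask[32] = {0x0, 0x1, 0x3, 0x7, /* ... */};
//   return x & mask[n];
// loads mask[n] == (1u << n) - 1 and ANDs it with x, which is exactly
// 'bzhi x, n', so the load and the AND collapse into one BZHI.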
33512 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
33513 const X86Subtarget &Subtarget) {
33514 MVT VT = Node->getSimpleValueType(0);
33517 // Check if subtarget has BZHI instruction for the node's type
33518 if (!hasBZHI(Subtarget, VT))
33521 // Try matching the pattern for both operands.
33522 for (unsigned i = 0; i < 2; i++) {
33523 SDValue N = Node->getOperand(i);
33524 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
33526 // continue if the operand is not a load instruction
33530 const Value *MemOp = Ld->getMemOperand()->getValue();
33535 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
33536 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
33537 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
33539 Constant *Init = GV->getInitializer();
33540 Type *Ty = Init->getType();
33541 if (!isa<ConstantDataArray>(Init) ||
33542 !Ty->getArrayElementType()->isIntegerTy() ||
33543 Ty->getArrayElementType()->getScalarSizeInBits() !=
33544 VT.getSizeInBits() ||
33545 Ty->getArrayNumElements() >
33546 Ty->getArrayElementType()->getScalarSizeInBits())
33549 // Check if the array's constant elements are suitable to our case.
33550 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
33551 bool ConstantsMatch = true;
33552 for (uint64_t j = 0; j < ArrayElementCount; j++) {
33553 ConstantInt *Elem =
33554 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
33555 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
33556 ConstantsMatch = false;
33560 if (!ConstantsMatch)
33563 // Do the transformation (For 32-bit type):
33564 // -> (and (load arr[idx]), inp)
33565 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
33566 // that will be replaced with one bzhi instruction.
33567 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
33568 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
33570 // Get the Node which indexes into the array.
33571 SDValue Index = getIndexFromUnindexedLoad(Ld);
33574 Index = DAG.getZExtOrTrunc(Index, dl, VT);
33576 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
33578 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
33579 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
33581 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
33589 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
33590 TargetLowering::DAGCombinerInfo &DCI,
33591 const X86Subtarget &Subtarget) {
33592 EVT VT = N->getValueType(0);
33594 // If this is SSE1 only convert to FAND to avoid scalarization.
33595 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
33596 return DAG.getBitcast(
33597 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
33598 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
33599 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
33602 if (DCI.isBeforeLegalizeOps())
33605 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
33608 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33611 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
33614 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
33617 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
33620 // Attempt to recursively combine a bitmask AND with shuffles.
33621 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
33623 if (SDValue Res = combineX86ShufflesRecursively(
33624 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33625 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33626 DCI.CombineTo(N, Res);
33631 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
33632 if ((VT.getScalarSizeInBits() % 8) == 0 &&
33633 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
33634 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
33635 SDValue BitMask = N->getOperand(1);
33636 SDValue SrcVec = N->getOperand(0).getOperand(0);
33637 EVT SrcVecVT = SrcVec.getValueType();
33639 // Check that the constant bitmask masks whole bytes.
33641 SmallVector<APInt, 64> EltBits;
33642 if (VT == SrcVecVT.getScalarType() &&
33643 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
33644 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
33645 llvm::all_of(EltBits, [](APInt M) {
33646 return M.isNullValue() || M.isAllOnesValue();
33648 unsigned NumElts = SrcVecVT.getVectorNumElements();
33649 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
33650 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
33652 // Create a root shuffle mask from the byte mask and the extracted index.
33653 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
33654 for (unsigned i = 0; i != Scale; ++i) {
33657 int VecIdx = Scale * Idx + i;
33658 ShuffleMask[VecIdx] =
33659 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
33662 if (SDValue Shuffle = combineX86ShufflesRecursively(
33663 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
33664 /*HasVarMask*/ false, DAG, DCI, Subtarget))
33665 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
33666 N->getOperand(0).getOperand(1));
33674 // Try to fold (or (and (m, y), (pandn m, x)))
33676 // into (vselect m, x, y).
33677 // As a special case, try to fold:
33678 //   (or (and (m, (sub 0, x)), (pandn m, x)))
33680 // into (sub (xor X, M), M).
33681 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
33682 const X86Subtarget &Subtarget) {
33683 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
33685 SDValue N0 = N->getOperand(0);
33686 SDValue N1 = N->getOperand(1);
33687 EVT VT = N->getValueType(0);
33689 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
33690 (VT.is256BitVector() && Subtarget.hasInt256())))
33693 // Canonicalize AND to LHS.
33694 if (N1.getOpcode() == ISD::AND)
33697 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
33698 // ANDNP combine allows other combines to happen that prevent matching.
33699 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
33702 SDValue Mask = N1.getOperand(0);
33703 SDValue X = N1.getOperand(1);
33705 if (N0.getOperand(0) == Mask)
33706 Y = N0.getOperand(1);
33707 if (N0.getOperand(1) == Mask)
33708 Y = N0.getOperand(0);
33710 // Check to see if the mask appeared in both the AND and ANDNP.
33714 // Validate that X, Y, and Mask are bitcasts, and see through them.
33715 Mask = peekThroughBitcasts(Mask);
33716 X = peekThroughBitcasts(X);
33717 Y = peekThroughBitcasts(Y);
33719 EVT MaskVT = Mask.getValueType();
33720 unsigned EltBits = MaskVT.getScalarSizeInBits();
33722 // TODO: Attempt to handle floating point cases as well?
33723 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
33729 // (or (and (M, (sub 0, X)), (pandn M, X)))
33730 // which is a special case of vselect:
33731 // (vselect M, (sub 0, X), X)
33733 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
33734 // We know that, if fNegate is 0 or 1:
33735 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
33737 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
33738 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
33739 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
33740 // This lets us transform our vselect to:
33741 // (add (xor X, M), (and M, 1))
33743 // (sub (xor X, M), M)
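// Checking the identity: with M == all-ones, (xor X, M) - M == ~X + 1 == -X;
// with M == 0, (xor X, 0) - 0 == X. So (sub (xor X, M), M) selects between X
// and -X exactly as the vselect above does.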
33744 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
33745 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
33746 auto IsNegV = [](SDNode *N, SDValue V) {
33747 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
    };

    SDValue V;
    if (IsNegV(Y.getNode(), X))
      V = X;
    else if (IsNegV(X.getNode(), Y))
      V = Y;

    if (V) {
      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
33758 SDValue SubOp2 = Mask;
33760 // If the negate was on the false side of the select, then
33761 // the operands of the SUB need to be swapped. PR 27251.
33762 // This is because the pattern being matched above is
33763 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
33764 // but if the pattern matched was
33765 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
33766 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
33767 // pattern also needs to be a negation of the replacement pattern above.
33768 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
33769 // sub accomplishes the negation of the replacement pattern.
      if (V == Y)
        std::swap(SubOp1, SubOp2);
33773 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
      return DAG.getBitcast(VT, Res);
    }
  }
33778 // PBLENDVB is only available on SSE 4.1.
  if (!Subtarget.hasSSE41())
    return SDValue();
33782 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
33784 X = DAG.getBitcast(BlendVT, X);
33785 Y = DAG.getBitcast(BlendVT, Y);
33786 Mask = DAG.getBitcast(BlendVT, Mask);
33787 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
  return DAG.getBitcast(VT, Mask);
}
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
33796 // Input pattern is checked by caller.
33797 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
33798 SelectionDAG &DAG) {
33799 SDValue Cmp = Op.getOperand(1);
33800 EVT VT = Cmp.getOperand(0).getValueType();
33801 unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
33804 // The result of the shift is true or false, and on X86, the 32-bit
33805 // encoding of shr and lzcnt is more desirable.
33806 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
33807 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
33808 DAG.getConstant(Log2b, dl, VT));
  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
33812 // Try to transform:
33813 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
33816 // Will also attempt to match more generic cases, eg:
33817 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
33818 // Only applies if the target supports the FastLZCNT feature.
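//
// For example (illustrative, assuming i32 operands and fast LZCNT): "x == 0"
// is equivalent to "lzcnt(x) >> 5", since lzcnt(x) is 32 only when x is zero,
// so
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// can become
//   srl(or(ctlz(x), ctlz(y)), 5)
// which avoids materializing the individual i1 setcc results.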
33819 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
33820 TargetLowering::DAGCombinerInfo &DCI,
33821 const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
    return SDValue();

  auto isORCandidate = [](SDValue N) {
    return (N->getOpcode() == ISD::OR && N->hasOneUse());
  };
  // Check that the zero extend is extending to 32 bits or more. The code
  // generated by srl(ctlz) for 16-bit or smaller variants of the pattern
  // would require extra instructions to clear the upper bits.
  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
      !isORCandidate(N->getOperand(0)))
    return SDValue();
33836 // Check the node matches: setcc(eq, cmp 0)
33837 auto isSetCCCandidate = [](SDValue N) {
33838 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
33839 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
33840 N->getOperand(1).getOpcode() == X86ISD::CMP &&
33841 isNullConstant(N->getOperand(1).getOperand(1)) &&
           N->getOperand(1).getValueType().bitsGE(MVT::i32);
  };
33845 SDNode *OR = N->getOperand(0).getNode();
33846 SDValue LHS = OR->getOperand(0);
33847 SDValue RHS = OR->getOperand(1);
33849 // Save nodes matching or(or, setcc(eq, cmp 0)).
33850 SmallVector<SDNode *, 2> ORNodes;
33851 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
33852 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
33853 ORNodes.push_back(OR);
33854 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
33855 LHS = OR->getOperand(0);
33856 RHS = OR->getOperand(1);
33859 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
33860 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
      !isORCandidate(SDValue(OR, 0)))
    return SDValue();
33864 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
33866 // or(srl(ctlz),srl(ctlz)).
33867 // The dag combiner can then fold it into:
33868 // srl(or(ctlz, ctlz)).
33869 EVT VT = OR->getValueType(0);
33870 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
33871 SDValue Ret, NewRHS;
33872 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
33873 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
33878 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
33879 while (ORNodes.size() > 0) {
33880 OR = ORNodes.pop_back_val();
33881 LHS = OR->getOperand(0);
33882 RHS = OR->getOperand(1);
33883 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
33884 if (RHS->getOpcode() == ISD::OR)
33885 std::swap(LHS, RHS);
33886 EVT VT = OR->getValueType(0);
    SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
    if (!NewRHS)
      return SDValue();
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
  }

  if (Ret)
    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

  return Ret;
}
33899 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
33900 TargetLowering::DAGCombinerInfo &DCI,
33901 const X86Subtarget &Subtarget) {
33902 SDValue N0 = N->getOperand(0);
33903 SDValue N1 = N->getOperand(1);
33904 EVT VT = N->getValueType(0);
33906 // If this is SSE1 only convert to FOR to avoid scalarization.
33907 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
33908 return DAG.getBitcast(MVT::v4i32,
33909 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
33910 DAG.getBitcast(MVT::v4f32, N0),
                                      DAG.getBitcast(MVT::v4f32, N1)));
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
    return R;

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();
33929 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
33930 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
33932 // SHLD/SHRD instructions have lower register pressure, but on some
33933 // platforms they have higher latency than the equivalent
33934 // series of shifts/or that would otherwise be generated.
33935 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
33936 // have higher latencies and we are not optimizing for size.
  if (!OptForSize && Subtarget.isSHLDSlow())
    return SDValue();

  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();
33947 SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
33953 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
33954 ShAmt0 = ShAmt0.getOperand(0);
33955 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  SDLoc DL(N);
33959 unsigned Opc = X86ISD::SHLD;
33960 SDValue Op0 = N0.getOperand(0);
33961 SDValue Op1 = N1.getOperand(0);
33962 if (ShAmt0.getOpcode() == ISD::SUB ||
33963 ShAmt0.getOpcode() == ISD::XOR) {
33964 Opc = X86ISD::SHRD;
33965 std::swap(Op0, Op1);
33966 std::swap(ShAmt0, ShAmt1);
33969 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
33970 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
33971 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
33972 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
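  //
  // A concrete instance of the first pattern (illustrative only): for i32 with
  // C = 8,
  //   (or (shl x, 8), (srl y, 24))  -->  SHLD(x, y, 8)
  // The XOR forms arise because (C ^ 31) equals (31 - C) for C in [0, 31], and
  // together with the extra shift by 1 the total shift amount is (32 - C).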
33973 unsigned Bits = VT.getSizeInBits();
33974 if (ShAmt1.getOpcode() == ISD::SUB) {
33975 SDValue Sum = ShAmt1.getOperand(0);
33976 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
33977 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
33978 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
33979 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
33980 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
33981 return DAG.getNode(Opc, DL, VT,
33983 DAG.getNode(ISD::TRUNCATE, DL,
33986 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
33987 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
33988 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
33989 return DAG.getNode(Opc, DL, VT,
33990 N0.getOperand(0), N1.getOperand(0),
33991 DAG.getNode(ISD::TRUNCATE, DL,
33993 } else if (ShAmt1.getOpcode() == ISD::XOR) {
33994 SDValue Mask = ShAmt1.getOperand(1);
33995 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
33996 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
33997 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
33998 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
33999 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
34000 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
34001 if (Op1.getOpcode() == InnerShift &&
34002 isa<ConstantSDNode>(Op1.getOperand(1)) &&
34003 Op1.getConstantOperandVal(1) == 1) {
34004 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
34005 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
34007 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
34008 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
34009 Op1.getOperand(0) == Op1.getOperand(1)) {
34010 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                               DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
      }
    }
  }

  return SDValue();
}
34020 /// Try to turn tests against the signbit in the form of:
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
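///
/// For example (illustrative only), for an i32 value X:
///   xor (trunc (srl X, 31) to i8), 1
/// asks whether the sign bit of X is clear, which is the same as X > -1, so it
/// can be emitted as a single compare-and-setcc instead of a shift sequence.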
34024 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
34025 // This is only worth doing if the output type is i8 or i1.
34026 EVT ResultType = N->getValueType(0);
  if (ResultType != MVT::i8 && ResultType != MVT::i1)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We should be performing an xor against a truncated shift.
  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
    return SDValue();

  // Make sure we are performing an xor against one.
  if (!isOneConstant(N1))
    return SDValue();

  // SetCC on x86 zero extends so only act on this if it's a logical shift.
  SDValue Shift = N0.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
    return SDValue();

  // Make sure we are truncating from one of i16, i32 or i64.
  EVT ShiftTy = Shift.getValueType();
  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
    return SDValue();

  // Make sure the shift amount extracts the sign bit.
  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
      Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
    return SDValue();

  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical-looking
  // comparison, and using SETGT matches up with what TranslateX86CC does.
  SDLoc DL(N);
  SDValue ShiftOp = Shift.getOperand(0);
34061 EVT ShiftOpTy = ShiftOp.getValueType();
34062 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34063 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
34064 *DAG.getContext(), ResultType);
34065 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
34066 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
34067 if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);

  return Cond;
}
34072 /// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
///
34077 /// This should be called before type legalization because the pattern may not
34078 /// persist after that.
34079 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
34080 const X86Subtarget &Subtarget) {
34081 EVT VT = N->getValueType(0);
34082 if (!VT.isSimple())
34085 switch (VT.getSimpleVT().SimpleTy) {
  default: return SDValue();
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
  case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
  }
34097 // There must be a shift right algebraic before the xor, and the xor must be a
34098 // 'not' operation.
34099 SDValue Shift = N->getOperand(0);
34100 SDValue Ones = N->getOperand(1);
34101 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
      !ISD::isBuildVectorAllOnes(Ones.getNode()))
    return SDValue();

  // The shift should be smearing the sign bit across each vector element.
  auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
  if (!ShiftBV)
    return SDValue();
34110 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
34111 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
34112 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
34115 // Create a greater-than comparison against -1. We don't use the more obvious
34116 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
34120 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
34121 /// is valid for the given \p Subtarget.
34122 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
34123 const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX512())
    return false;

  // FIXME: Scalar type may be supported if we move it to vector register.
  if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
    return false;

  EVT SrcElVT = SrcVT.getScalarType();
  EVT DstElVT = DstVT.getScalarType();
  if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
    return false;
  if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
    return false;
  if (SrcVT.is512BitVector() || Subtarget.hasVLX())
    return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
  return false;
}
34142 /// Detect a pattern of truncation with saturation:
34143 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was not
/// detected.
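///
/// For example (illustrative only), truncating v16i32 -> v16i16:
///   (truncate (umin x, <65535 x 16>) to v16i16)
/// is an unsigned-saturating truncate, since 65535 is the unsigned max of the
/// destination element type; AVX512 can lower it with VPMOVUSDW.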
34146 static SDValue detectUSatPattern(SDValue In, EVT VT) {
34147 if (In.getOpcode() != ISD::UMIN)
  // Saturation with truncation. We truncate from InVT to VT.
34151 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
34152 "Unexpected types for truncate operation");
  APInt C;
  if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
    // the element size of the destination type.
    return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
  }
  return SDValue();
}
34164 /// Detect a pattern of truncation with saturation:
34165 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
34166 /// The types should allow to use VPMOVUS* instruction on AVX512.
34167 /// Return the source value to be truncated or SDValue() if the pattern was not
34169 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
34170 const X86Subtarget &Subtarget) {
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();

  return detectUSatPattern(In, VT);
}

static SDValue
combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
34178 const X86Subtarget &Subtarget) {
34179 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34180 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
34182 if (auto USatVal = detectUSatPattern(In, VT))
34183 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);

  return SDValue();
}
34188 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
34189 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
34190 /// X86ISD::AVG instruction.
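///
/// A small worked example (illustrative only) for unsigned i8 inputs a = 250
/// and b = 253: the pattern computes (250 + 253 + 1) / 2 = 252 in a wider type
/// to avoid overflow, which is exactly what PAVGB produces, so the
/// zext/add/lshr/trunc chain can be replaced by a single X86ISD::AVG node.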
34191 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget,
                                const SDLoc &DL) {
  if (!VT.isVector() || !VT.isSimple())
    return SDValue();
34196 EVT InVT = In.getValueType();
34197 unsigned NumElems = VT.getVectorNumElements();
34199 EVT ScalarVT = VT.getVectorElementType();
34200 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
34201 isPowerOf2_32(NumElems)))
34204 // InScalarVT is the intermediate type in AVG pattern and it should be greater
34205 // than the original input type (i8/i16).
34206 EVT InScalarVT = InVT.getVectorElementType();
34207 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
34210 if (!Subtarget.hasSSE2())
34213 // Detect the following pattern:
34215 // %1 = zext <N x i8> %a to <N x i32>
34216 // %2 = zext <N x i8> %b to <N x i32>
34217 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
34218 // %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
34220 // %6 = trunc <N x i32> %5 to <N x i8>
34222 // In AVX512, the last instruction can also be a trunc store.
34223 if (In.getOpcode() != ISD::SRL)
34226 // A lambda checking the given SDValue is a constant vector and each element
34227 // is in the range [Min, Max].
34228 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
34229 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV || !BV->isConstant())
      return false;
    for (SDValue Op : V->ops()) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
      if (!C)
        return false;
      uint64_t Val = C->getZExtValue();
      if (Val < Min || Val > Max)
        return false;
    }
    return true;
  };
34243 // Split vectors to legal target size and apply AVG.
34244 auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
34245 unsigned NumSubs = 1;
34246 if (Subtarget.hasBWI()) {
34247 if (VT.getSizeInBits() > 512)
34248 NumSubs = VT.getSizeInBits() / 512;
34249 } else if (Subtarget.hasAVX2()) {
34250 if (VT.getSizeInBits() > 256)
        NumSubs = VT.getSizeInBits() / 256;
    } else {
      if (VT.getSizeInBits() > 128)
        NumSubs = VT.getSizeInBits() / 128;
    }

    if (NumSubs == 1)
      return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);
34260 SmallVector<SDValue, 4> Subs;
34261 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
34262 VT.getVectorNumElements() / NumSubs);
34263 for (unsigned i = 0; i != NumSubs; ++i) {
34264 unsigned Idx = i * SubVT.getVectorNumElements();
34265 SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
34266 SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
34267 Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
34269 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
34272 // Check if each element of the vector is left-shifted by one.
34273 auto LHS = In.getOperand(0);
34274 auto RHS = In.getOperand(1);
34275 if (!IsConstVectorInRange(RHS, 1, 1))
34277 if (LHS.getOpcode() != ISD::ADD)
34280 // Detect a pattern of a + b + 1 where the order doesn't matter.
34281 SDValue Operands[3];
34282 Operands[0] = LHS.getOperand(0);
34283 Operands[1] = LHS.getOperand(1);
34285 // Take care of the case when one of the operands is a constant vector whose
34286 // element is in the range [1, 256].
34287 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
34288 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
34289 Operands[0].getOperand(0).getValueType() == VT) {
34290 // The pattern is detected. Subtract one from the constant vector, then
34291 // demote it and emit X86ISD::AVG instruction.
34292 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
34293 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
34294 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
34295 return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
34298 if (Operands[0].getOpcode() == ISD::ADD)
34299 std::swap(Operands[0], Operands[1]);
34300 else if (Operands[1].getOpcode() != ISD::ADD)
34302 Operands[2] = Operands[1].getOperand(0);
34303 Operands[1] = Operands[1].getOperand(1);
34305 // Now we have three operands of two additions. Check that one of them is a
34306 // constant vector with ones, and the other two are promoted from i8/i16.
34307 for (int i = 0; i < 3; ++i) {
34308 if (!IsConstVectorInRange(Operands[i], 1, 1))
34310 std::swap(Operands[i], Operands[2]);
34312 // Check if Operands[0] and Operands[1] are results of type promotion.
34313 for (int j = 0; j < 2; ++j)
34314 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
34315 Operands[j].getOperand(0).getValueType() != VT)
34318 // The pattern is detected, emit X86ISD::AVG instruction.
    return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
  }

  return SDValue();
}
34325 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
34326 TargetLowering::DAGCombinerInfo &DCI,
34327 const X86Subtarget &Subtarget) {
34328 LoadSDNode *Ld = cast<LoadSDNode>(N);
34329 EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
34332 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34334 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
34335 // into two 16-byte operations. Also split non-temporal aligned loads on
34336 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
34337 ISD::LoadExtType Ext = Ld->getExtensionType();
  bool Fast;
  unsigned AddressSpace = Ld->getAddressSpace();
34340 unsigned Alignment = Ld->getAlignment();
34341 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
34342 Ext == ISD::NON_EXTLOAD &&
34343 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
34344 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
34345 AddressSpace, Alignment, &Fast) && !Fast))) {
34346 unsigned NumElems = RegVT.getVectorNumElements();
34350 SDValue Ptr = Ld->getBasePtr();
34352 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
34355 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
34356 Alignment, Ld->getMemOperand()->getFlags());
34358 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
34360 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
34361 Ld->getPointerInfo().getWithOffset(16),
34362 MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
34363 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34365 Load2.getValue(1));
34367 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  return SDValue();
}
34374 /// If V is a build vector of boolean constants and exactly one of those
34375 /// constants is true, return the operand index of that true element.
34376 /// Otherwise, return -1.
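///
/// For example (illustrative only), the mask build vector
///   <i1 0, i1 1, i1 0, i1 0>
/// returns 1, while an all-zeros mask or a mask with two or more ones
/// returns -1.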
34377 static int getOneTrueElt(SDValue V) {
34378 // This needs to be a build vector of booleans.
34379 // TODO: Checking for the i1 type matches the IR definition for the mask,
34380 // but the mask check could be loosened to i8 or other types. That might
34381 // also require checking more than 'allOnesValue'; eg, the x86 HW
34382 // instructions only require that the MSB is set for each mask element.
34383 // The ISD::MSTORE comments/definition do not specify how the mask operand
34385 auto *BV = dyn_cast<BuildVectorSDNode>(V);
34386 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
34389 int TrueIndex = -1;
34390 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
34391 for (unsigned i = 0; i < NumElts; ++i) {
34392 const SDValue &Op = BV->getOperand(i);
34395 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
34398 if (ConstNode->getAPIntValue().isAllOnesValue()) {
34399 // If we already found a one, this is too many.
34400 if (TrueIndex >= 0)
34408 /// Given a masked memory load/store operation, return true if it has one mask
34409 /// bit set. If it has one mask bit set, then also return the memory address of
34410 /// the scalar element to load/store, the vector index to insert/extract that
34411 /// scalar element, and the alignment for the scalar memory access.
34412 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
34413 SelectionDAG &DAG, SDValue &Addr,
34414 SDValue &Index, unsigned &Alignment) {
34415 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
34416 if (TrueMaskElt < 0)
34419 // Get the address of the one scalar element that is specified by the mask
34420 // using the appropriate offset from the base pointer.
34421 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
34422 Addr = MaskedOp->getBasePtr();
34423 if (TrueMaskElt != 0) {
34424 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
34425 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
34428 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
34429 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
34433 /// If exactly one element of the mask is set for a non-extending masked load,
34434 /// it is a scalar load and vector insert.
34435 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34436 /// mask have already been optimized in IR, so we don't bother with those here.
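///
/// For example (illustrative only), a v4f32 masked load with mask
/// <i1 0, i1 0, i1 1, i1 0> becomes a scalar f32 load from BasePtr + 8
/// followed by an INSERT_VECTOR_ELT of the loaded value into lane 2 of the
/// pass-through vector.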
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34439 TargetLowering::DAGCombinerInfo &DCI) {
34440 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34441 // However, some target hooks may need to be added to know when the transform
34442 // is profitable. Endianness would also have to be considered.
34444 SDValue Addr, VecIndex;
34445 unsigned Alignment;
34446 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
34449 // Load the one scalar element that is specified by the mask using the
34450 // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
34456 Alignment, ML->getMemOperand()->getFlags());
34458 // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
                               Load, VecIndex);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}

static SDValue
34465 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34466 TargetLowering::DAGCombinerInfo &DCI) {
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
34471 EVT VT = ML->getValueType(0);
34473 // If we are loading the first and last elements of a vector, it is safe and
34474 // always faster to load the whole vector. Replace the masked load with a
34475 // vector load and select.
34476 unsigned NumElts = VT.getVectorNumElements();
34477 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
34478 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
34479 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
34480 if (LoadFirstElt && LoadLastElt) {
34481 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34482 ML->getMemOperand());
34483 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
34484 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
34487 // Convert a masked load with a constant mask into a masked load and a select.
34488 // This allows the select operation to use a faster kind of select instruction
34489 // (for example, vblendvps -> vblendps).
34491 // Don't try this if the pass-through operand is already undefined. That would
34492 // cause an infinite loop because that's what we're about to create.
34493 if (ML->getSrc0().isUndef())
34496 // The new masked load has an undef pass-through operand. The select uses the
34497 // original pass-through operand.
34498 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34499 ML->getMask(), DAG.getUNDEF(VT),
34500 ML->getMemoryVT(), ML->getMemOperand(),
34501 ML->getExtensionType());
34502 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
34504 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
34507 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
34508 TargetLowering::DAGCombinerInfo &DCI,
34509 const X86Subtarget &Subtarget) {
34510 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
34512 // TODO: Expanding load with constant mask may be optimized as well.
34513 if (Mld->isExpandingLoad())
34516 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
34517 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
34519 // TODO: Do some AVX512 subsets benefit from this transform?
34520 if (!Subtarget.hasAVX512())
34521 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
34525 if (Mld->getExtensionType() != ISD::SEXTLOAD)
34528 // Resolve extending loads.
34529 EVT VT = Mld->getValueType(0);
34530 unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);
34534 assert(LdVT != VT && "Cannot extend to the same type");
34535 unsigned ToSz = VT.getScalarSizeInBits();
34536 unsigned FromSz = LdVT.getScalarSizeInBits();
34537 // From/To sizes and ElemCount must be pow of two.
34538 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
34539 "Unexpected size for extending masked load");
34541 unsigned SizeRatio = ToSz / FromSz;
34542 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
34544 // Create a type on which we perform the shuffle.
34545 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34546 LdVT.getScalarType(), NumElems*SizeRatio);
34547 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34549 // Convert Src0 value.
34550 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
34551 if (!Mld->getSrc0().isUndef()) {
34552 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34553 for (unsigned i = 0; i != NumElems; ++i)
34554 ShuffleVec[i] = i * SizeRatio;
34556 // Can't shuffle using an illegal type.
34557 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
34558 "WideVecVT should be legal");
34559 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
34560 DAG.getUNDEF(WideVecVT), ShuffleVec);
34563 // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
34566 if (Mask.getValueType() == VT) {
34567 // Mask and original value have the same type.
34568 NewMask = DAG.getBitcast(WideVecVT, Mask);
34569 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34570 for (unsigned i = 0; i != NumElems; ++i)
34571 ShuffleVec[i] = i * SizeRatio;
34572 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
34573 ShuffleVec[i] = NumElems * SizeRatio;
34574 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
34575 DAG.getConstant(0, dl, WideVecVT),
34578 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
34579 unsigned WidenNumElts = NumElems*SizeRatio;
34580 unsigned MaskNumElts = VT.getVectorNumElements();
34581 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
34584 unsigned NumConcat = WidenNumElts / MaskNumElts;
34585 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
34586 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
34588 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
34591 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
34592 Mld->getBasePtr(), NewMask, WideSrc0,
34593 Mld->getMemoryVT(), Mld->getMemOperand(),
34595 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
34596 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
34599 /// If exactly one element of the mask is set for a non-truncating masked store,
34600 /// it is a vector extract and scalar store.
34601 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34602 /// mask have already been optimized in IR, so we don't bother with those here.
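///
/// For example (illustrative only), a v4i32 masked store with mask
/// <i1 0, i1 1, i1 0, i1 0> becomes an EXTRACT_VECTOR_ELT of lane 1 followed
/// by a scalar i32 store to BasePtr + 4.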
34603 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
34604 SelectionDAG &DAG) {
34605 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34606 // However, some target hooks may need to be added to know when the transform
34607 // is profitable. Endianness would also have to be considered.
34609 SDValue Addr, VecIndex;
34610 unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  EVT VT = MS->getValue().getValueType();
34617 EVT EltVT = VT.getVectorElementType();
34618 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
34619 MS->getValue(), VecIndex);
34621 // Store that element at the appropriate offset from the base pointer.
34622 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
34623 Alignment, MS->getMemOperand()->getFlags());
34626 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
34627 const X86Subtarget &Subtarget) {
34628 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
34630 if (Mst->isCompressingStore())
34633 if (!Mst->isTruncatingStore()) {
34634 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
34635 return ScalarStore;
34637 // If the mask is checking (0 > X), we're creating a vector with all-zeros
34638 // or all-ones elements based on the sign bits of X. AVX1 masked store only
34639 // cares about the sign bit of each mask element, so eliminate the compare:
34640 // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
34641 // Note that by waiting to match an x86-specific PCMPGT node, we're
34642 // eliminating potentially more complex matching of a setcc node which has
34643 // a full range of predicates.
34644 SDValue Mask = Mst->getMask();
34645 if (Mask.getOpcode() == X86ISD::PCMPGT &&
34646 ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
34647 assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
34648 "Unexpected type for PCMPGT");
34649 return DAG.getMaskedStore(
34650 Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
34651 Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
34654 // TODO: AVX512 targets should also be able to simplify something like the
34655 // pattern above, but that pattern will be different. It will either need to
34656 // match setcc more generally or match PCMPGTM later (in tablegen?).
34661 // Resolve truncating stores.
34662 EVT VT = Mst->getValue().getValueType();
34663 unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);
34667 assert(StVT != VT && "Cannot truncate to the same type");
34668 unsigned FromSz = VT.getScalarSizeInBits();
34669 unsigned ToSz = StVT.getScalarSizeInBits();
34671 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34673 // The truncating store is legal in some cases. For example
34674 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
34675 // are designated for truncate store.
34676 // In this case we don't need any further transformations.
34677 if (TLI.isTruncStoreLegal(VT, StVT))
34680 // From/To sizes and ElemCount must be pow of two.
34681 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
34682 "Unexpected size for truncating masked store");
34683 // We are going to use the original vector elt for storing.
34684 // Accumulated smaller vector elements must be a multiple of the store size.
34685 assert (((NumElems * FromSz) % ToSz) == 0 &&
34686 "Unexpected ratio for truncating masked store");
34688 unsigned SizeRatio = FromSz / ToSz;
34689 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34691 // Create a type on which we perform the shuffle.
34692 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34693 StVT.getScalarType(), NumElems*SizeRatio);
34695 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34697 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
34698 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34699 for (unsigned i = 0; i != NumElems; ++i)
34700 ShuffleVec[i] = i * SizeRatio;
34702 // Can't shuffle using an illegal type.
34703 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
34704 "WideVecVT should be legal");
34706 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34707 DAG.getUNDEF(WideVecVT),
34711 SDValue Mask = Mst->getMask();
34712 if (Mask.getValueType() == VT) {
34713 // Mask and original value have the same type.
34714 NewMask = DAG.getBitcast(WideVecVT, Mask);
34715 for (unsigned i = 0; i != NumElems; ++i)
34716 ShuffleVec[i] = i * SizeRatio;
34717 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
34718 ShuffleVec[i] = NumElems*SizeRatio;
34719 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
34720 DAG.getConstant(0, dl, WideVecVT),
34723 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
34724 unsigned WidenNumElts = NumElems*SizeRatio;
34725 unsigned MaskNumElts = VT.getVectorNumElements();
34726 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
34729 unsigned NumConcat = WidenNumElts / MaskNumElts;
34730 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
34731 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
34733 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
34736 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
34737 Mst->getBasePtr(), NewMask, StVT,
34738 Mst->getMemOperand(), false);
34741 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
34742 const X86Subtarget &Subtarget) {
34743 StoreSDNode *St = cast<StoreSDNode>(N);
34744 EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
34747 SDValue StoredVal = St->getOperand(1);
34748 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34750 // If we are saving a concatenation of two XMM registers and 32-byte stores
34751 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
34754 unsigned Alignment = St->getAlignment();
34755 if (VT.is256BitVector() && StVT == VT &&
34756 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
34757 AddressSpace, Alignment, &Fast) &&
34759 unsigned NumElems = VT.getVectorNumElements();
34763 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
34764 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
34766 SDValue Ptr0 = St->getBasePtr();
34767 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
34770 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
34771 Alignment, St->getMemOperand()->getFlags());
34773 DAG.getStore(St->getChain(), dl, Value1, Ptr1,
34774 St->getPointerInfo().getWithOffset(16),
34775 MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
34776 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
34779 // Optimize trunc store (of multiple scalars) to shuffle and store.
34780 // First, pack all of the elements in one place. Next, store to memory
34781 // in fewer chunks.
34782 if (St->isTruncatingStore() && VT.isVector()) {
34783 // Check if we can detect an AVG pattern from the truncation. If yes,
34784 // replace the trunc store by a normal store with the result of X86ISD::AVG
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
34788 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
34789 St->getPointerInfo(), St->getAlignment(),
34790 St->getMemOperand()->getFlags());
34793 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
34794 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
34795 dl, Val, St->getBasePtr(),
34796 St->getMemoryVT(), St->getMemOperand(), DAG);
34798 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34799 unsigned NumElems = VT.getVectorNumElements();
34800 assert(StVT != VT && "Cannot truncate to the same type");
34801 unsigned FromSz = VT.getScalarSizeInBits();
34802 unsigned ToSz = StVT.getScalarSizeInBits();
34804 // The truncating store is legal in some cases. For example
34805 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
34806 // are designated for truncate store.
34807 // In this case we don't need any further transformations.
34808 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
34811 // From, To sizes and ElemCount must be pow of two
34812 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
34813 // We are going to use the original vector elt for storing.
34814 // Accumulated smaller vector elements must be a multiple of the store size.
34815 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
34817 unsigned SizeRatio = FromSz / ToSz;
34819 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34821 // Create a type on which we perform the shuffle
34822 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34823 StVT.getScalarType(), NumElems*SizeRatio);
34825 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34827 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
34828 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
34829 for (unsigned i = 0; i != NumElems; ++i)
34830 ShuffleVec[i] = i * SizeRatio;
34832 // Can't shuffle using an illegal type.
34833 if (!TLI.isTypeLegal(WideVecVT))
34836 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34837 DAG.getUNDEF(WideVecVT),
34839 // At this point all of the data is stored at the bottom of the
34840 // register. We now need to save it to mem.
34842 // Find the largest store unit
34843 MVT StoreType = MVT::i8;
34844 for (MVT Tp : MVT::integer_valuetypes()) {
34845 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
34849 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
34850 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
34851 (64 <= NumElems * ToSz))
34852 StoreType = MVT::f64;
34854 // Bitcast the original vector into a vector of store-size units
34855 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
34856 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
34857 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
34858 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
34859 SmallVector<SDValue, 8> Chains;
34860 SDValue Ptr = St->getBasePtr();
34862 // Perform one or more big stores into memory.
34863 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
34864 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
34865 StoreType, ShuffWide,
34866 DAG.getIntPtrConstant(i, dl));
34868 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
34869 St->getAlignment(), St->getMemOperand()->getFlags());
34870 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
34871 Chains.push_back(Ch);
34874 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
34877 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
34878 // the FP state in cases where an emms may be missing.
34879 // A preferable solution to the general problem is to figure out the right
34880 // places to insert EMMS. This qualifies as a quick hack.
34882 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
34883 if (VT.getSizeInBits() != 64)
34886 const Function &F = DAG.getMachineFunction().getFunction();
34887 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
34890 if ((VT.isVector() ||
34891 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
34892 isa<LoadSDNode>(St->getValue()) &&
34893 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
34894 St->getChain().hasOneUse() && !St->isVolatile()) {
34895 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
34896 SmallVector<SDValue, 8> Ops;
34898 if (!ISD::isNormalLoad(Ld))
34901 // If this is not the MMX case, i.e. we are just turning i64 load/store
34902 // into f64 load/store, avoid the transformation if there are multiple
34903 // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
34909 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
34910 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
34912 if (Subtarget.is64Bit() || F64IsLegal) {
34913 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
34914 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
34915 Ld->getMemOperand());
34917 // Make sure new load is placed in same chain order.
34918 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
34919 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
34920 St->getMemOperand());
34923 // Otherwise, lower to two pairs of 32-bit loads / stores.
34924 SDValue LoAddr = Ld->getBasePtr();
34925 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
34927 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
34928 Ld->getPointerInfo(), Ld->getAlignment(),
34929 Ld->getMemOperand()->getFlags());
34930 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
34931 Ld->getPointerInfo().getWithOffset(4),
34932 MinAlign(Ld->getAlignment(), 4),
34933 Ld->getMemOperand()->getFlags());
34934 // Make sure new loads are placed in same chain order.
34935 DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
34936 DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
34938 LoAddr = St->getBasePtr();
34939 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
34942 DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
34943 St->getAlignment(), St->getMemOperand()->getFlags());
34944 SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
34945 St->getPointerInfo().getWithOffset(4),
34946 MinAlign(St->getAlignment(), 4),
34947 St->getMemOperand()->getFlags());
34948 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
34951 // This is similar to the above case, but here we handle a scalar 64-bit
34952 // integer store that is extracted from a vector on a 32-bit target.
34953 // If we have SSE2, then we can treat it like a floating-point double
34954 // to get past legalization. The execution dependencies fixup pass will
34955 // choose the optimal machine instruction for the store if this really is
34956 // an integer or v2f32 rather than an f64.
34957 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
34958 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
34959 SDValue OldExtract = St->getOperand(1);
34960 SDValue ExtOp0 = OldExtract.getOperand(0);
34961 unsigned VecSize = ExtOp0.getValueSizeInBits();
34962 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
34963 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
34964 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
34965 BitCast, OldExtract.getOperand(1));
34966 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
34967 St->getPointerInfo(), St->getAlignment(),
34968 St->getMemOperand()->getFlags());
34974 /// Return 'true' if this vector operation is "horizontal"
34975 /// and return the operands for the horizontal operation in LHS and RHS. A
34976 /// horizontal operation performs the binary operation on successive elements
34977 /// of its first operand, then on successive elements of its second operand,
34978 /// returning the resulting values in a vector. For example, if
34979 /// A = < float a0, float a1, float a2, float a3 >
34981 /// B = < float b0, float b1, float b2, float b3 >
34982 /// then the result of doing a horizontal operation on A and B is
34983 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
34984 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
34985 /// A horizontal-op B, for some already available A and B, and if so then LHS is
34986 /// set to A, RHS to B, and the routine returns 'true'.
34987 /// Note that the binary operation should have the property that if one of the
34988 /// operands is UNDEF then the result is UNDEF.
34989 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
34990 // Look for the following pattern: if
34991 // A = < float a0, float a1, float a2, float a3 >
34992 // B = < float b0, float b1, float b2, float b3 >
34994 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
34995 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
34996 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
34997 // which is A horizontal-op B.
34999 // At least one of the operands should be a vector shuffle.
35000 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
35001 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
35004 MVT VT = LHS.getSimpleValueType();
35006 assert((VT.is128BitVector() || VT.is256BitVector()) &&
35007 "Unsupported vector type for horizontal add/sub");
35009 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
35010 // operate independently on 128-bit lanes.
35011 unsigned NumElts = VT.getVectorNumElements();
35012 unsigned NumLanes = VT.getSizeInBits()/128;
35013 unsigned NumLaneElts = NumElts / NumLanes;
35014 assert((NumLaneElts % 2 == 0) &&
35015 "Vector type should have an even number of elements in each lane");
35016 unsigned HalfLaneElts = NumLaneElts/2;
35018 // View LHS in the form
35019 // LHS = VECTOR_SHUFFLE A, B, LMask
35020 // If LHS is not a shuffle then pretend it is the shuffle
35021 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
35022 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
35026 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
35027 if (!LHS.getOperand(0).isUndef())
35028 A = LHS.getOperand(0);
35029 if (!LHS.getOperand(1).isUndef())
35030 B = LHS.getOperand(1);
35031 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
35032 std::copy(Mask.begin(), Mask.end(), LMask.begin());
35034 if (!LHS.isUndef())
35036 for (unsigned i = 0; i != NumElts; ++i)
35040 // Likewise, view RHS in the form
35041 // RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
35044 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
35045 if (!RHS.getOperand(0).isUndef())
35046 C = RHS.getOperand(0);
35047 if (!RHS.getOperand(1).isUndef())
35048 D = RHS.getOperand(1);
35049 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
35050 std::copy(Mask.begin(), Mask.end(), RMask.begin());
35052 if (!RHS.isUndef())
35054 for (unsigned i = 0; i != NumElts; ++i)
35058 // Check that the shuffles are both shuffling the same vectors.
35059 if (!(A == C && B == D) && !(A == D && B == C))
35062 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
35063 if (!A.getNode() && !B.getNode())
35066 // If A and B occur in reverse order in RHS, then "swap" them (which means
35067 // rewriting the mask).
35069 ShuffleVectorSDNode::commuteMask(RMask);
35071 // At this point LHS and RHS are equivalent to
35072 // LHS = VECTOR_SHUFFLE A, B, LMask
35073 // RHS = VECTOR_SHUFFLE A, B, RMask
35074 // Check that the masks correspond to performing a horizontal operation.
35075 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
35076 for (unsigned i = 0; i != NumLaneElts; ++i) {
35077 int LIdx = LMask[i+l], RIdx = RMask[i+l];
35079 // Ignore any UNDEF components.
35080 if (LIdx < 0 || RIdx < 0 ||
35081 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
35082 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
35085 // Check that successive elements are being operated on. If not, this is
35086 // not a horizontal operation.
35087 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
35088 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
35089 if (!(LIdx == Index && RIdx == Index + 1) &&
35090 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
35095 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}
35100 /// Do target-specific dag combines on floating-point adds/subs.
35101 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
35102 const X86Subtarget &Subtarget) {
35103 EVT VT = N->getValueType(0);
35104 SDValue LHS = N->getOperand(0);
35105 SDValue RHS = N->getOperand(1);
35106 bool IsFadd = N->getOpcode() == ISD::FADD;
35107 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
35109 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
35110 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
35111 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
35112 isHorizontalBinOp(LHS, RHS, IsFadd)) {
35113 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }

  return SDValue();
}
35119 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
35122 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
35123 const X86Subtarget &Subtarget,
                                          const SDLoc &DL) {
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
35126 SDValue Src = N->getOperand(0);
35127 unsigned Opcode = Src.getOpcode();
35128 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35130 EVT VT = N->getValueType(0);
35131 EVT SrcVT = Src.getValueType();
35133 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
35134 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
35136 // Repeated operand, so we are only trading one output truncation for
35137 // one input truncation.
35141 // See if either operand has been extended from a smaller/equal size to
35142 // the truncation size, allowing a truncation to combine with the extend.
35143 unsigned Opcode0 = Op0.getOpcode();
35144 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
35145 Opcode0 == ISD::ZERO_EXTEND) &&
35146 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
35149 unsigned Opcode1 = Op1.getOpcode();
35150 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
35151 Opcode1 == ISD::ZERO_EXTEND) &&
35152 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
35155 // See if either operand is a single use constant which can be constant
35157 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
35158 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
35159 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
35160 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
35163 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
35164 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
35165 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
35166 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
35169 // Don't combine if the operation has other uses.
35170 if (!N->isOnlyUserOf(Src.getNode()))
35173 // Only support vector truncation for now.
35174 // TODO: i64 scalar math would benefit as well.
35175 if (!VT.isVector())
35178 // In most cases its only worth pre-truncating if we're only facing the cost
35179 // of one truncation.
35180 // i.e. if one of the inputs will constant fold or the input is repeated.
35185 SDValue Op0 = Src.getOperand(0);
35186 SDValue Op1 = Src.getOperand(1);
35187 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
35188 IsRepeatedOpOrFreeTruncation(Op0, Op1))
35189 return TruncateArithmetic(Op0, Op1);
35194 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
35195 // better to truncate if we have the chance.
35196 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
35197 !Subtarget.hasDQI())
35198 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
35201 // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
35202 SDValue Op0 = Src.getOperand(0);
35203 SDValue Op1 = Src.getOperand(1);
35204 if (TLI.isOperationLegal(Opcode, VT) &&
35205 IsRepeatedOpOrFreeTruncation(Op0, Op1))
35206 return TruncateArithmetic(Op0, Op1);
35214 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
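///
/// For example (illustrative only), to truncate v8i32 -> v8i16 the inputs are
/// first masked with 0xFFFF so the unsigned-saturating pack cannot clamp them,
/// and then two v4i32 registers are packed into one v8i16 with X86ISD::PACKUS.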
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
35217 SmallVector<SDValue, 8> &Regs) {
35218 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
35219 Regs[0].getValueType() == MVT::v2i64));
35220 EVT OutVT = N->getValueType(0);
35221 EVT OutSVT = OutVT.getVectorElementType();
35222 EVT InVT = Regs[0].getValueType();
35223 EVT InSVT = InVT.getVectorElementType();
35226 // First, use mask to unset all bits that won't appear in the result.
35227 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
35228 "OutSVT can only be either i8 or i16.");
  APInt Mask =
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
35231 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
35232 for (auto &Reg : Regs)
35233 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
35235 MVT UnpackedVT, PackedVT;
35236 if (OutSVT == MVT::i8) {
35237 UnpackedVT = MVT::v8i16;
35238 PackedVT = MVT::v16i8;
35240 UnpackedVT = MVT::v4i32;
35241 PackedVT = MVT::v8i16;
35244 // In each iteration, truncate the type by a half size.
35245 auto RegNum = Regs.size();
35246 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
35247 j < e; j *= 2, RegNum /= 2) {
35248 for (unsigned i = 0; i < RegNum; i++)
35249 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
35250 for (unsigned i = 0; i < RegNum / 2; i++)
35251 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
35255 // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
35256 // then extract a subvector as the result since v8i8 is not a legal type.
35257 if (OutVT == MVT::v8i8) {
35258 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
35259 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
35260 DAG.getIntPtrConstant(0, DL));
35262 } else if (RegNum > 1) {
35263 Regs.resize(RegNum);
35264 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
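// For example, truncating 2 x v4i32 to v8i16 on SSE4.1: each input is ANDed
// with 0xFFFF and a single X86ISD::PACKUS (packusdw) merges the pair into one
// v8i16 register, replacing a cross-lane shuffle sequence.
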
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
  EVT OutVT = N->getValueType(0);
  SDLoc DL(N);

  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
  for (auto &Reg : Regs) {
    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
  }

  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
                          Regs[i * 2 + 1]);

  if (Regs.size() > 2) {
    Regs.resize(Regs.size() / 2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}

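// For example, truncating 2 x v4i32 to v8i16 without SSE4.1: shifting each
// lane left then arithmetic-right by 16 makes every element the
// sign-extension of its low 16 bits, so X86ISD::PACKSS (packssdw) packs the
// pair losslessly.
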
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in fewer instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
    return SDValue();

  SDLoc DL(N);

  // Split a long vector into vectors of legal type.
  unsigned RegNum = InVT.getSizeInBits() / 128;
  SmallVector<SDValue, 8> SubVec(RegNum);
  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

  for (unsigned i = 0; i < RegNum; i++)
    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
  else if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
  else
    return SDValue();
}

/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
/// X86ISD::PACKSS/PACKUS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  MVT SVT = VT.getScalarType();

  MVT InVT = In.getValueType().getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Check we have a truncation suited for PACKSS.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();
  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
    return SDValue();

  // Use PACKSS if the input has sign-bits that extend all the way to the
  // packed/truncated value. e.g. comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
  unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

  // Use PACKUS if the input has zero-bits that extend all the way to the
  // packed/truncated value. e.g. masks, zext_in_reg, etc.
  KnownBits Known;
  DAG.computeKnownBits(In, Known);
  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

  return SDValue();
}

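// For example, a v8i32 comparison result has all-zero or all-one lanes (32
// sign bits per element), so truncating it to v8i16 can use PACKSS directly
// instead of a shuffle sequence; likewise a mask with enough known leading
// zero bits per element can use PACKUS.
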
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return V;

  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // Try to combine truncation with unsigned saturation.
  if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
    return Val;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 and x86mmx.
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return V;

  return combineVectorTruncation(N, DAG, Subtarget);
}

/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
static SDValue isFNEG(SDNode *N) {
  if (N->getOpcode() == ISD::FNEG)
    return N->getOperand(0);

  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
    return SDValue();

  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
  if (!Op1.getValueType().isFloatingPoint())
    return SDValue();

  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));

  unsigned EltBits = Op1.getScalarValueSizeInBits();
  auto isSignMask = [&](const ConstantFP *C) {
    return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
  };

  // There is more than one way to represent the same constant on
  // the different X86 targets. The type of the node may also depend on size.
  //  - load scalar value and broadcast
  //  - BUILD_VECTOR node
  //  - load from a constant pool.
  // We check all variants here.
  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
      if (isSignMask(cast<ConstantFP>(C)))
        return Op0;
  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
      if (isSignMask(CN->getConstantFPValue()))
        return Op0;
  } else if (auto *C = getTargetConstantFromNode(Op1)) {
    if (C->getType()->isVectorTy()) {
      if (auto *SplatV = C->getSplatValue())
        if (isSignMask(cast<ConstantFP>(SplatV)))
          return Op0;
    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
      if (isSignMask(FPConst))
        return Op0;
  }
  return SDValue();
}

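// For example, on AVX512F (v4f32 fneg X) may reach here in the form
//   (v4f32 bitcast (xor (v4i32 bitcast X), (v4i32 splat 0x80000000)))
// and the sign-mask splat is recognized whether it comes from a broadcast, a
// BUILD_VECTOR or a constant-pool load.
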
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  EVT OrigVT = N->getValueType(0);
  SDValue Arg = isFNEG(N);
  assert(Arg.getNode() && "N is expected to be an FNEG node");

  EVT VT = Arg.getValueType();
  EVT SVT = VT.getScalarType();
  SDLoc DL(N);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // If we're negating a FMUL node on a target with FMA, then we can avoid the
  // use of a constant by performing (-0 - A*B) instead.
  // FIXME: Check rounding control flags as well once it becomes available.
  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
                                  Arg.getOperand(1), Zero);
    return DAG.getBitcast(OrigVT, NewNode);
  }

  // If we're negating an FMA node, then we can adjust the
  // instruction to include the extra negation.
  unsigned NewOpcode = 0;
  if (Arg.hasOneUse()) {
    switch (Arg.getOpcode()) {
    case ISD::FMA:           NewOpcode = X86ISD::FNMSUB;     break;
    case X86ISD::FMSUB:      NewOpcode = X86ISD::FNMADD;     break;
    case X86ISD::FNMADD:     NewOpcode = X86ISD::FMSUB;      break;
    case X86ISD::FNMSUB:     NewOpcode = ISD::FMA;           break;
    case X86ISD::FMADD_RND:  NewOpcode = X86ISD::FNMSUB_RND; break;
    case X86ISD::FMSUB_RND:  NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND;  break;
    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND;  break;
    // We can't handle scalar intrinsic node here because it would only
    // invert one element and not the whole vector. But we could try to handle
    // a negation of the lower element only.
    }
  }
  if (NewOpcode)
    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
                                              Arg.getNode()->ops()));

  return SDValue();
}

static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  // If we have integer vector types available, use the integer opcodes.
  if (VT.isVector() && Subtarget.hasSSE2()) {
    SDLoc dl(N);

    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
    unsigned IntOpcode;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected FP logic op");
    case X86ISD::FOR:   IntOpcode = ISD::OR;       break;
    case X86ISD::FXOR:  IntOpcode = ISD::XOR;      break;
    case X86ISD::FAND:  IntOpcode = ISD::AND;      break;
    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
    }
    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
    return DAG.getBitcast(VT, IntOp);
  }
  return SDValue();
}

/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() != ISD::XOR)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
    return SDValue();

  X86::CondCode NewCC = X86::GetOppositeBranchCondition(
      X86::CondCode(LHS->getConstantOperandVal(0)));
  SDLoc DL(N);
  return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}

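// For example:
//   (xor (x86setcc COND_E, EFLAGS), 1) --> (x86setcc COND_NE, EFLAGS)
// The xor is absorbed entirely by inverting the condition code.
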
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  // If this is SSE1 only convert to FXOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
      N->getValueType(0) == MVT::v4i32) {
    return DAG.getBitcast(
        MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
  }

  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue SetCC = foldXor1SetCC(N, DAG))
    return SetCC;

  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
    return RV;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (isFNEG(N))
    return combineFneg(N, DAG, Subtarget);
  return SDValue();
}

static bool isNullFPScalarOrVectorConst(SDValue V) {
  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}

/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  if (!isNullFPScalarOrVectorConst(V))
    return SDValue();

  if (V.getValueType().isVector())
    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

  return V;
}

static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
    return SDValue();

  auto isAllOnesConstantFP = [](SDValue V) {
    if (V.getSimpleValueType().isVector())
      return ISD::isBuildVectorAllOnes(V.getNode());
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  // FAND(0.0, x) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
    return V;

  // FAND(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  // FANDN(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // FANDN(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
    return N->getOperand(0);

  if (isFNEG(N))
    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
      return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
  default: llvm_unreachable("unknown opcode");
  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}

static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  // TODO: Check for global or instruction-level "nnan". In that case, we
  //       should be able to lower to FMAX/FMIN alone.
  // TODO: If an operand is already known to be a NaN or not a NaN, this
  //       should be an optional swap and FMAX/FMIN.

  EVT VT = N->getValueType(0);
  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
    return SDValue();

  // This takes at least 3 instructions, so favor a library call when operating
  // on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  //  Op0       ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}

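// For (f32 fmaxnum X, Y) the nodes built above are approximately:
//   M = X86ISD::FMAX(Y, X)   // passes X through if either input is NaN
//   C = setcc X, X, setuo    // true iff X is NaN
//   R = select C, Y, M       // a NaN X selects Y; a NaN Y falls out of FMAX
// which implements the table above with a single extra compare and select.
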
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
    return N->getOperand(1);

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

  EVT VT = N->getValueType(0);

  // Attempt to recursively combine a bitmask ANDNP with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      DCI.CombineTo(N, Res);
      return SDValue();
    }
  }

  return SDValue();
}

static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // BT ignores high bits in the bit index operand.
  unsigned BitWidth = N1.getValueSizeInBits();
  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
  if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
    return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);

  return SDValue();
}

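// For example, for an i32 bit index only the low 5 bits matter, so
//   (x86bt X, (and Idx, 31)) --> (x86bt X, Idx)
// and the explicit masking instruction disappears.
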
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
  // since there is no sign-extended shift right operation on a vector with
  // 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //  (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2,
    // it may be replaced with X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags().hasNoSignedWrap();
  bool NUW = Add->getFlags().hasNoUnsignedWrap();

  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext'.
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddOp1)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
  SDValue AddOp0 = Add.getOperand(0);
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // sign-extended.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}

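// For example:
//   (i64 sext (add nsw (i32 X), 42)) --> (add nsw (i64 sext X), 42)
// When the result feeds another add or shl, the constant can then fold into
// the displacement of a single LEA instead of a separate extend and add.
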
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 ||
      !(VT == MVT::i32 || VT == MVT::i64))
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  // If this was a 64-bit extend, complete it.
  if (VT == MVT::i64)
    return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
  return R.getValue(1);
}

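// For example, (i32 sext (i8 srem X, Y)) becomes an SDIVREM8_SEXT_HREG node,
// which can lower to the 8-bit divide followed by a single sign-extending
// move of the remainder out of AH, rather than extracting AH first and
// extending it separately.
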
// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
// operands and the result of CMOV is not used anywhere else - promote CMOV
// itself instead of promoting its result. This could be beneficial, because:
//     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
//        (or more) pseudo-CMOVs only when they go one-after-another and
//        getting rid of result extension code after CMOV will help that.
//     2) Promotion of constant CMOV arguments is free, hence the
//        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
//     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
//        promotion is also good in terms of code-size.
//        (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
//         promotion.)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
  SDValue CMovN = Extend->getOperand(0);
  if (CMovN.getOpcode() != X86ISD::CMOV)
    return SDValue();

  EVT TargetVT = Extend->getValueType(0);
  unsigned ExtendOpcode = Extend->getOpcode();
  SDLoc DL(Extend);

  EVT VT = CMovN.getValueType();
  SDValue CMovOp0 = CMovN.getOperand(0);
  SDValue CMovOp1 = CMovN.getOperand(1);

  bool DoPromoteCMOV =
      (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
      CMovN.hasOneUse() &&
      (isa<ConstantSDNode>(CMovOp0.getNode()) &&
       isa<ConstantSDNode>(CMovOp1.getNode()));

  if (!DoPromoteCMOV)
    return SDValue();

  CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
  CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);

  return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
                     CMovN.getOperand(2), CMovN.getOperand(3));
}

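// For example:
//   (i32 zext (i16 cmov 7, 12, cond, EFLAGS))
//     --> (i32 cmov 7, 12, cond, EFLAGS)
// The constant promotion is free and the extend node disappears entirely.
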
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
      Opcode != ISD::ANY_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InSVT = N0.getValueType().getScalarType();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  // Input type must be extending a bool vector (bit-casted from a scalar
  // integer) to legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
    return SDValue();
  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  EVT SclVT = N0.getOperand(0).getValueType();
  if (!SclVT.isScalarInteger())
    return SDValue();

  SDLoc DL(N);
  SDValue Vec;
  SmallVector<int, 32> ShuffleMask;
  unsigned NumElts = VT.getVectorNumElements();
  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

  // Broadcast the scalar integer to the vector elements.
  if (NumElts > EltSizeInBits) {
    // If the scalar integer is greater than the vector element size, then we
    // must split it down into sub-sections for broadcasting. For example:
    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
    unsigned Scale = NumElts / EltSizeInBits;
    EVT BroadcastVT =
        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
    Vec = DAG.getBitcast(VT, Vec);

    for (unsigned i = 0; i != Scale; ++i)
      ShuffleMask.append(EltSizeInBits, i);
  } else {
    // For smaller scalar integers, we can simply any-extend it to the vector
    // element size (we don't care about the upper bits) and broadcast it to
    // all elements.
    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
    ShuffleMask.append(NumElts, 0);
  }
  Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

  // Now, mask the relevant bit in each element.
  SmallVector<SDValue, 32> Bits;
  for (unsigned i = 0; i != NumElts; ++i) {
    int BitIdx = (i % EltSizeInBits);
    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
  }
  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

  // Compare against the bitmask and extend the result.
  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

  // For SEXT, this is now done, otherwise shift the result down for
  // zero-extension.
  if (Opcode == ISD::SIGN_EXTEND)
    return Vec;
  return DAG.getNode(ISD::SRL, DL, VT, Vec,
                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
}

/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
/// with UNDEFs) of the input to vectors of the same size as the target type
/// which then extends the lowest elements.
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InVT = N0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // Input type must be a vector and we must be extending legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  // On AVX2+ targets, if the input/output types are both legal then we will be
  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  SDLoc DL(N);

  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
    EVT InVT = N.getValueType();
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
                                 Size / InVT.getScalarSizeInBits());
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
  };

  // If target-size is less than 128-bits, extend to a type that would extend
  // to 128 bits, extend that and extract the original target vector.
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
    unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                       DAG.getIntPtrConstant(0, DL));
  }

  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to allow the legalizer to do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
      (VT.is512BitVector() && Subtarget.hasAVX512())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
  }

  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

    SmallVector<SDValue, 8> Opnds;
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                   DAG.getIntPtrConstant(Offset, DL));
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
      SrcVec = Opcode == ISD::SIGN_EXTEND
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
      Opnds.push_back(SrcVec);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
  };

  // On pre-AVX2 targets, split into 128-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
    return SplitAndExtendInReg(128);

  // On pre-AVX512 targets, split into 256-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
    return SplitAndExtendInReg(256);

  return SDValue();
}

static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    // Inverting and sign-extending a boolean is the same as zero-extending and
    // subtracting 1 because 0 becomes -1 and 1 becomes 0. The subtract is
    // efficiently lowered with an LEA or a DEC. This is the same as:
    // select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.isVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  auto invertIfNegative = [](SDValue &V) {
    if (SDValue NegVal = isFNEG(V.getNode())) {
      V = NegVal;
      return true;
    }
    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
              N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
              N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);

  // Negative multiplication when NegA xor NegB
  bool NegMul = (NegA != NegB);
  bool HasNeg = NegA || NegB || NegC;

  unsigned NewOpcode;
  if (!NegMul)
    NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
  else
    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  // For FMA, we risk reconstructing the node we started with.
  // In order to avoid this, we check for negation or opcode change. If
  // one of the two happened, then it is a new node and we return it.
  if (N->getOpcode() == ISD::FMA) {
    if (HasNeg || NewOpcode != N->getOpcode())
      return DAG.getNode(NewOpcode, dl, VT, A, B, C);
    return SDValue();
  }

  if (N->getOpcode() == X86ISD::FMADD_RND) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADD_RND;  break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND;  break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1;  break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1;  break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3;  break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3;  break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1_RND;  break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND;  break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3_RND;  break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND;  break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADD4S) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADD4S;  break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB4S;  break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
    }
  } else {
    llvm_unreachable("Unexpected opcode!");
  }

  // Only return the node if the opcode was changed or one of the
  // operands was negated. If not, we'll just recreate the same node.
  if (HasNeg || NewOpcode != N->getOpcode()) {
    if (N->getNumOperands() == 4)
      return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }

  return SDValue();
}

// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SDValue NegVal = isFNEG(N->getOperand(2).getNode());
  if (!NegVal)
    return SDValue();

  unsigned NewOpcode;
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::FMADDSUB:     NewOpcode = X86ISD::FMSUBADD;     break;
  case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
  case X86ISD::FMSUBADD:     NewOpcode = X86ISD::FMADDSUB;     break;
  case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
  }

  if (N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                       NegVal, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                     NegVal);
}

static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.isVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}

/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison.
  SDValue X = SetCC->getOperand(0);
  SDValue Y = SetCC->getOperand(1);
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128)
    return SDValue();

  // Ignore a comparison with zero because that gets special treatment in
  // EmitTest(). But make an exception for the special case of a pair of
  // logically-combined vector-sized operands compared to zero. This pattern may
  // be generated by the memcmp expansion pass with oversized integer compares
  // (see PR33325).
  bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
                          X.getOperand(0).getOpcode() == ISD::XOR &&
                          X.getOperand(1).getOpcode() == ISD::XOR;
  if (isNullConstant(Y) && !IsOrXorXorCCZero)
    return SDValue();

  // Bail out if we know that this is not really just an oversized integer.
  if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
      peekThroughBitcasts(Y).getValueType() == MVT::f128)
    return SDValue();

  // TODO: Use PXOR + PTEST for SSE4.1 or later?
  // TODO: Add support for AVX-512.
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);
  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
      (OpSize == 256 && Subtarget.hasAVX2())) {
    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
    SDValue Cmp;
    if (IsOrXorXorCCZero) {
      // This is a bitwise-combined equality comparison of 2 pairs of vectors:
      // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
      // Use 2 vector equality compares and 'and' the results before doing a
      // MOVMSK.
      SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
      SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
      SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
      SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
      SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B);
      SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D);
      Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
    } else {
      SDValue VecX = DAG.getBitcast(VecVT, X);
      SDValue VecY = DAG.getBitcast(VecVT, Y);
      Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
    }
    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
                                    MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  return SDValue();
}

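// For example, a 16-byte memcmp(a, b) == 0 expanded to an i128 compare maps
// to roughly:
//   pcmpeqb  %xmm1, %xmm0
//   pmovmskb %xmm0, %eax
//   cmpl     $0xffff, %eax
// rather than two 64-bit compares and branches.
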
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    EVT OpVT = LHS.getValueType();
    // 0-x == y --> x+y == 0
    // 0-x != y --> x+y != 0
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    // Put build_vectors on the right.
    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
    }

    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}

static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Src = N->getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());

  // MOVMSK only uses the MSB from each vector element.
  KnownBits Known;
  APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
  if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
    DCI.AddToWorklist(Src.getNode());
    DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0);
  }

  return SDValue();
}

static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  if (DCI.isBeforeLegalizeOps()) {
    SDValue Index = N->getOperand(4);
    // Remove any sign extends from 32 or smaller to larger than 32.
    // Only do this before LegalizeOps in case we need the sign extend for
    // legalization.
    if (Index.getOpcode() == ISD::SIGN_EXTEND) {
      if (Index.getScalarValueSizeInBits() > 32 &&
          Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
        NewOps[4] = Index.getOperand(0);
        DAG.UpdateNodeOperands(N, NewOps);
        // The original sign extend has fewer uses, add back to worklist in
        // case it needs to be removed.
        DCI.AddToWorklist(Index.getNode());
        DCI.AddToWorklist(N);
        return SDValue(N, 0);
      }
    }

    // Make sure the index is either i32 or i64.
    unsigned ScalarSize = Index.getScalarValueSizeInBits();
    if (ScalarSize != 32 && ScalarSize != 64) {
      MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
                                     Index.getValueType().getVectorNumElements());
      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
      NewOps[4] = Index;
      DAG.UpdateNodeOperands(N, NewOps);
      DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }

    // Try to remove zero extends from 32->64 if we know the sign bit of
    // the input is zero.
    if (Index.getOpcode() == ISD::ZERO_EXTEND &&
        Index.getScalarValueSizeInBits() == 64 &&
        Index.getOperand(0).getScalarValueSizeInBits() == 32) {
      if (DAG.SignBitIsZero(Index.getOperand(0))) {
        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
        NewOps[4] = Index.getOperand(0);
        DAG.UpdateNodeOperands(N, NewOps);
        // The original zero extend has fewer uses, add back to worklist in
        // case it needs to be removed.
        DCI.AddToWorklist(Index.getNode());
        DCI.AddToWorklist(N);
        return SDValue(N, 0);
      }
    }
  }

  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1. So the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
    return SDValue(N, 0);
  }

  // With AVX2 we only demand the upper bit of the mask.
  if (!Subtarget.hasAVX512()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    KnownBits Known;
    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
      DCI.AddToWorklist(Mask.getNode());
      DCI.CommitTargetLoweringOpt(TLO);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

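// For example, a gather whose index operand is (v4i64 sext (v4i32 X)) can use
// X directly, since the 32-bit-index gather forms sign-extend each index when
// forming addresses anyway, so the explicit extend is redundant.
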
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}

/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}

static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //    AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)
  //
  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    return Res;
  }

  return SDValue();
}

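// For example:
//   (v4f32 sint_to_fp (and (setcc ...), (v4i32 splat 1)))
//     --> (bitcast (and (setcc ...), (bitcast (v4f32 splat 1.0))))
// Each lane of the compare is 0 or -1, so each result lane is either all-zero
// bits (0.0) or the pre-converted constant, and the conversion folds away.
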
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Op0))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);

  return SDValue();
}

36794 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
36795 const X86Subtarget &Subtarget) {
36796 // First try to optimize away the conversion entirely when it's
36797 // conditionally from a constant. Vectors only.
36798 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
36801 // Now move on to more general possibilities.
36802 SDValue Op0 = N->getOperand(0);
36803 EVT VT = N->getValueType(0);
36804 EVT InVT = Op0.getValueType();
36805 EVT InSVT = InVT.getScalarType();
36807 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
36808 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
36809 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
36810 if (InVT.isVector() &&
36811 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
36812 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
36814 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
36815 InVT.getVectorNumElements());
36816 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
36817 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
36820 // Without AVX512DQ we only support i64 to float scalar conversion. For both
36821 // vectors and scalars, see if we know that the upper bits are all the sign
36822 // bit, in which case we can truncate the input to i32 and convert from that.
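  // E.g. (a sketch): for (sint_to_fp (sext i32 x to i64)) the input has at
  // least 33 sign bits, so converting from the truncated i32 is safe.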
36823 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
36824 unsigned BitWidth = InVT.getScalarSizeInBits();
36825 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
36826 if (NumSignBits >= (BitWidth - 31)) {
36827 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
36828 if (InVT.isVector())
36829 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
36830 InVT.getVectorNumElements());
36832 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
36833 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
36837 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
36838 // a 32-bit target where SSE doesn't support i64->FP operations.
36839 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
36840 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
36841 EVT LdVT = Ld->getValueType(0);
    // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();
36847 if (!Ld->isVolatile() && !VT.isVector() &&
36848 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
36849 !Subtarget.is64Bit() && LdVT == MVT::i64) {
36850 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
36851 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
36852 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
36859 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
36860 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
36861 MVT VT = N->getSimpleValueType(0);
36862 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
36863 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
36864 N->getOperand(0), N->getOperand(1),
36871 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
36872 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
36873 TargetLowering::DAGCombinerInfo &DCI) {
36874 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
36875 // the result is either zero or one (depending on the input carry bit).
36876 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
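  // E.g. (a sketch): (adc 0, 0, EFLAGS) is just the carry bit, so it becomes
  // (and (setcc_carry EFLAGS), 1), producing 0 or 1 without an add.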
36877 if (X86::isZeroNode(N->getOperand(0)) &&
36878 X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // the flag result won't be used.
      SDValue(N, 1).use_empty()) {
36883 EVT VT = N->getValueType(0);
36884 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
36885 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
36886 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36887 DAG.getConstant(X86::COND_B, DL,
36890 DAG.getConstant(1, DL, VT));
36891 return DCI.CombineTo(N, Res1, CarryOut);
36894 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
36895 MVT VT = N->getSimpleValueType(0);
36896 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
36897 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
36898 N->getOperand(0), N->getOperand(1),
36905 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
36906 /// which is more useful than 0/1 in some cases.
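/// E.g. (a sketch): "sbb %eax, %eax" leaves %eax equal to 0 or 0xffffffff
/// depending on CF, directly usable as a mask, unlike the 0/1 from "setb".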
36907 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
36909 // "Condition code B" is also known as "the carry flag" (CF).
36910 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
36911 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
  MVT VT = N->getSimpleValueType(0);
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
36920 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
36921 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
36922 /// with CMP+{ADC, SBB}.
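/// For example (a sketch): "add X, (setne Z, 0)" can lower to "cmp Z, 1"
/// (which sets CF exactly when Z == 0) followed by "sbb X, -1", avoiding the
/// materialization of the 0/1 setcc result in a register.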
36923 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
36924 bool IsSub = N->getOpcode() == ISD::SUB;
36925 SDValue X = N->getOperand(0);
36926 SDValue Y = N->getOperand(1);
36928 // If this is an add, canonicalize a zext operand to the RHS.
36929 // TODO: Incomplete? What if both sides are zexts?
36930 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
36931 Y.getOpcode() != ISD::ZERO_EXTEND)
36934 // Look through a one-use zext.
36935 bool PeekedThroughZext = false;
36936 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
36937 Y = Y.getOperand(0);
36938 PeekedThroughZext = true;
36941 // If this is an add, canonicalize a setcc operand to the RHS.
36942 // TODO: Incomplete? What if both sides are setcc?
36943 // TODO: Should we allow peeking through a zext of the other operand?
36944 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
36945 Y.getOpcode() != X86ISD::SETCC)
36948 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
36952 EVT VT = N->getValueType(0);
36953 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
36955 // If X is -1 or 0, then we have an opportunity to avoid constants required in
36956 // the general case below.
  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
  if (ConstantX) {
    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
36960 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
36961 // This is a complicated way to get -1 or 0 from the carry flag:
36962 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
36963 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
36964 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36965 DAG.getConstant(X86::COND_B, DL, MVT::i8),
36969 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
36970 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
36971 SDValue EFLAGS = Y->getOperand(1);
36972 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
36973 EFLAGS.getValueType().isInteger() &&
36974 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
36975 // Swap the operands of a SUB, and we have the same pattern as above.
36976 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
36977 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
36978 SDValue NewSub = DAG.getNode(
36979 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
36980 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
36981 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
36982 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36983 DAG.getConstant(X86::COND_B, DL, MVT::i8),
36989 if (CC == X86::COND_B) {
36990 // X + SETB Z --> X + (mask SBB Z, Z)
36991 // X - SETB Z --> X - (mask SBB Z, Z)
36992 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
36993 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
36994 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
36995 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
36996 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
36999 if (CC == X86::COND_A) {
37000 SDValue EFLAGS = Y->getOperand(1);
37001 // Try to convert COND_A into COND_B in an attempt to facilitate
37002 // materializing "setb reg".
37004 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
37005 // cannot take an immediate as its first operand.
37007 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
37008 EFLAGS.getValueType().isInteger() &&
37009 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
37010 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
37011 EFLAGS.getNode()->getVTList(),
37012 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
37013 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
37014 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
37015 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
37016 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
37017 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
37021 if (CC != X86::COND_E && CC != X86::COND_NE)
37024 SDValue Cmp = Y.getOperand(1);
37025 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
37026 !X86::isZeroNode(Cmp.getOperand(1)) ||
37027 !Cmp.getOperand(0).getValueType().isInteger())
37030 SDValue Z = Cmp.getOperand(0);
37031 EVT ZVT = Z.getValueType();
  // If X is -1 or 0, then we have an opportunity to avoid constants required in
  // the general case below.
  if (ConstantX) {
    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
    // fake operands:
37038 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
37039 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
37040 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
37041 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
37042 SDValue Zero = DAG.getConstant(0, DL, ZVT);
37043 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
37044 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
37045 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37046 DAG.getConstant(X86::COND_B, DL, MVT::i8),
37047 SDValue(Neg.getNode(), 1));
37050 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
37051 // with fake operands:
37052 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
37053 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
37054 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
37055 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
37056 SDValue One = DAG.getConstant(1, DL, ZVT);
37057 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
37058 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37059 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
37063 // (cmp Z, 1) sets the carry flag if Z is 0.
37064 SDValue One = DAG.getConstant(1, DL, ZVT);
37065 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
37067 // Add the flags type for ADC/SBB nodes.
37068 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
37070 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
37071 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
37072 if (CC == X86::COND_NE)
37073 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
37074 DAG.getConstant(-1ULL, DL, VT), Cmp1);
37076 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
37077 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
37078 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
37079 DAG.getConstant(0, DL, VT), Cmp1);
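/// Try to turn a vector-reduction add of a widened multiply into VPMADDWD.
/// A sketch of the shape being matched, assuming i16 values extended to vXi32:
///    add (mul (ext a), (ext b)), phi
/// --> add (concat (vpmaddwd a, b), zero), phi
/// where vpmaddwd produces half as many i32 lanes as its i16 inputs.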
37082 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
37083 const X86Subtarget &Subtarget) {
37084 if (!Subtarget.hasSSE2())
37087 SDValue MulOp = N->getOperand(0);
37088 SDValue Phi = N->getOperand(1);
  if (MulOp.getOpcode() != ISD::MUL)
    std::swap(MulOp, Phi);
  if (MulOp.getOpcode() != ISD::MUL)
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();
37099 EVT VT = N->getValueType(0);
  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;
37106 unsigned VectorSize = VT.getVectorNumElements() * 16;
37107 // If the vector size is less than 128, or greater than the supported RegSize,
37108 // do not use PMADD.
37109 if (VectorSize < 128 || VectorSize > RegSize)
37113 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
37114 VT.getVectorNumElements());
37115 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
37116 VT.getVectorNumElements() / 2);
37118 // Shrink the operands of mul.
37119 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
37120 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
37122 // Madd vector size is half of the original vector size
37123 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
37124 // Fill the rest of the output with 0
37125 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
37126 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
37127 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
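/// Try to turn a vector-reduction add fed by a zero-extended absolute
/// difference into PSADBW. A sketch of the shape being matched:
///    add (vselect (setgt a, b), (sub a, b), (sub b, a)), phi
/// with i8 elements zero-extended, which is what psadbw(a, b) computes,
/// summed horizontally within each 64-bit chunk.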
37130 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
37131 const X86Subtarget &Subtarget) {
37132 if (!Subtarget.hasSSE2())
37136 EVT VT = N->getValueType(0);
37137 SDValue Op0 = N->getOperand(0);
37138 SDValue Op1 = N->getOperand(1);
37140 // TODO: There's nothing special about i32, any integer type above i16 should
37141 // work just as well.
37142 if (!VT.isVector() || !VT.isSimple() ||
37143 !(VT.getVectorElementType() == MVT::i32))
  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;
37152 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
37153 // TODO: We should be able to handle larger vectors by splitting them before
37154 // feeding them into several SADs, and then reducing over those.
37155 if (VT.getSizeInBits() / 4 > RegSize)
37158 // We know N is a reduction add, which means one of its operands is a phi.
37159 // To match SAD, we need the other operand to be a vector select.
  SDValue SelectOp, Phi;
  if (Op0.getOpcode() == ISD::VSELECT) {
    SelectOp = Op0;
    Phi = Op1;
  } else if (Op1.getOpcode() == ISD::VSELECT) {
    SelectOp = Op1;
    Phi = Op0;
  } else
    return SDValue();
37170 // Check whether we have an abs-diff pattern feeding into the select.
37171 if(!detectZextAbsDiff(SelectOp, Op0, Op1))
  // SAD pattern detected. Now build a SAD instruction and an addition for
  // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we can only update
  // part of the elements in the reduction vector.
37178 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
37180 // The output of PSADBW is a vector of i64.
37181 // We need to turn the vector of i64 into a vector of i32.
  // If the reduction vector is at least as wide as the psadbw result, just
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
  // anyway.
37185 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
37186 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
37187 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
37191 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
37192 // Fill the upper elements with zero to match the add width.
37193 SDValue Zero = DAG.getConstant(0, DL, VT);
37194 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
37195 DAG.getIntPtrConstant(0, DL));
37198 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
37201 /// Convert vector increment or decrement to sub/add with an all-ones constant:
37202 /// add X, <1, 1...> --> sub X, <-1, -1...>
37203 /// sub X, <1, 1...> --> add X, <-1, -1...>
37204 /// The all-ones vector constant can be materialized using a pcmpeq instruction
37205 /// that is commonly recognized as an idiom (has no register dependency), so
37206 /// that's better/smaller than loading a splat 1 constant.
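/// E.g. (a sketch, SSE2 asm):
///    pcmpeqd %xmm1, %xmm1   ; xmm1 = <-1,-1,-1,-1> without a constant load
///    psubd   %xmm1, %xmm0   ; X + <1,1,1,1> computed as X - <-1,-1,-1,-1>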
37207 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
37208 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
37209 "Unexpected opcode for increment/decrement transform");
37211 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
37212 // out and wait for legalization if we have an unsupported vector length.
37213 EVT VT = N->getValueType(0);
37214 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
  SDNode *N1 = N->getOperand(1).getNode();
  APInt SplatVal;
  if (!ISD::isConstantSplatVector(N1, SplatVal) ||
      !SplatVal.isOneValue())
    return SDValue();
37223 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
37224 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
37225 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
37228 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
37229 const X86Subtarget &Subtarget) {
37230 const SDNodeFlags Flags = N->getFlags();
37231 if (Flags.hasVectorReduction()) {
37232 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
37234 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
37237 EVT VT = N->getValueType(0);
37238 SDValue Op0 = N->getOperand(0);
37239 SDValue Op1 = N->getOperand(1);
37241 // Try to synthesize horizontal adds from adds of shuffles.
37242 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
37243 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
37244 isHorizontalBinOp(Op0, Op1, true))
37245 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
37247 if (SDValue V = combineIncDecVector(N, DAG))
37250 return combineAddOrSubToADCOrSBB(N, DAG);
37253 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
37254 const X86Subtarget &Subtarget) {
37255 SDValue Op0 = N->getOperand(0);
37256 SDValue Op1 = N->getOperand(1);
37257 EVT VT = N->getValueType(0);
  // PSUBUS is supported starting from SSE2, but the special preprocessing
  // for v8i32 requires umin, which first appears in SSE41.
37261 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
37262 !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
37263 !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
37264 !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
37265 (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
37266 VT == MVT::v8i64)))
37269 SDValue SubusLHS, SubusRHS;
  // Try to find umax(a,b) - b or a - umin(a,b) patterns
  // that may be converted to subus(a,b).
  // TODO: Need to add IR canonicalization for this code.
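  // E.g. (a sketch): sub(umax(a,b), b) and sub(a, umin(a,b)) both compute
  // max(a - b, 0), i.e. the saturating difference subus(a, b).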
37273 if (Op0.getOpcode() == ISD::UMAX) {
37275 SDValue MaxLHS = Op0.getOperand(0);
37276 SDValue MaxRHS = Op0.getOperand(1);
37279 else if (MaxRHS == Op1)
37283 } else if (Op1.getOpcode() == ISD::UMIN) {
37285 SDValue MinLHS = Op1.getOperand(0);
37286 SDValue MinRHS = Op1.getOperand(1);
37289 else if (MinRHS == Op0)
37296 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
37297 // special preprocessing in some cases.
37298 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
37299 return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);
  // The special preprocessing case can only be applied
  // if the value was zero extended from 16 bit,
  // so we require the top 16 bits to be zero for 32 bit
  // values, or the top 48 bits for 64 bit values.
  KnownBits Known;
  DAG.computeKnownBits(SubusLHS, Known);
37307 unsigned NumZeros = Known.countMinLeadingZeros();
37308 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
  EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType;
  if (VT == MVT::v8i32 || VT == MVT::v8i64)
    ShrinkedType = MVT::v8i16;
  else
    ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
  // If SubusLHS is zero-extended, truncate SubusRHS to the same narrow
  // size: SubusRHS = umin(0xFFF.., SubusRHS).
37320 SDValue SaturationConst =
37321 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
37322 ShrinkedType.getScalarSizeInBits()),
37323 SDLoc(SubusLHS), ExtType);
37324 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
37326 SDValue NewSubusLHS =
37327 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
37328 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
37329 SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
37330 NewSubusLHS, NewSubusRHS);
  // Zero extend the result; it may be used somewhere as 32 bit. If it is not,
  // the zext and the following trunc will simply be shrunk away.
37333 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
37336 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
37337 const X86Subtarget &Subtarget) {
37338 SDValue Op0 = N->getOperand(0);
37339 SDValue Op1 = N->getOperand(1);
37341 // X86 can't encode an immediate LHS of a sub. See if we can push the
37342 // negation into a preceding instruction.
37343 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
37344 // If the RHS of the sub is a XOR with one use and a constant, invert the
37345 // immediate. Then add one to the LHS of the sub so we can turn
37346 // X-Y -> X+~Y+1, saving one register.
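    // E.g. (a sketch): "5 - (x ^ 3)" becomes "(x ^ ~3) + 6", since
    // -(x ^ 3) == (x ^ ~3) + 1, and the new constant folds into the add.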
37347 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
37348 isa<ConstantSDNode>(Op1.getOperand(1))) {
37349 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
37350 EVT VT = Op0.getValueType();
37351 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
37353 DAG.getConstant(~XorC, SDLoc(Op1), VT));
37354 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
37355 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
37359 // Try to synthesize horizontal subs from subs of shuffles.
37360 EVT VT = N->getValueType(0);
37361 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
37362 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
37363 isHorizontalBinOp(Op0, Op1, false))
37364 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
37366 if (SDValue V = combineIncDecVector(N, DAG))
37369 // Try to create PSUBUS if SUB's argument is max/min
37370 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
37373 return combineAddOrSubToADCOrSBB(N, DAG);
37376 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
37377 TargetLowering::DAGCombinerInfo &DCI,
37378 const X86Subtarget &Subtarget) {
37379 if (DCI.isBeforeLegalize())
37383 unsigned Opcode = N->getOpcode();
37384 MVT VT = N->getSimpleValueType(0);
37385 MVT SVT = VT.getVectorElementType();
37386 unsigned NumElts = VT.getVectorNumElements();
37387 unsigned EltSizeInBits = SVT.getSizeInBits();
37389 SDValue Op = N->getOperand(0);
37390 MVT OpVT = Op.getSimpleValueType();
37391 MVT OpEltVT = OpVT.getVectorElementType();
37392 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
37393 unsigned InputBits = OpEltSizeInBits * NumElts;
37395 // Perform any constant folding.
37396 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
37399 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
37400 APInt Undefs(NumElts, 0);
37401 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
37404 for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
37409 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
37410 : EltBits[i].sextOrTrunc(EltSizeInBits);
37412 return getConstVector(Vals, Undefs, VT, DAG, DL);
  // (vzext (bitcast (vzext x))) -> (vzext x)
  // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
37417 SDValue V = peekThroughBitcasts(Op);
37418 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
37419 MVT InnerVT = V.getSimpleValueType();
37420 MVT InnerEltVT = InnerVT.getVectorElementType();
37422 // If the element sizes match exactly, we can just do one larger vzext. This
37423 // is always an exact type match as vzext operates on integer types.
37424 if (OpEltVT == InnerEltVT) {
37425 assert(OpVT == InnerVT && "Types must match for vzext!");
37426 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
37429 // The only other way we can combine them is if only a single element of the
37430 // inner vzext is used in the input to the outer vzext.
37431 if (InnerEltVT.getSizeInBits() < InputBits)
37434 // In this case, the inner vzext is completely dead because we're going to
37435 // only look at bits inside of the low element. Just do the outer vzext on
37436 // a bitcast of the input to the inner.
37437 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
37440 // Check if we can bypass extracting and re-inserting an element of an input
37441 // vector. Essentially:
37442 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
37443 // TODO: Add X86ISD::VSEXT support
37444 if (Opcode == X86ISD::VZEXT &&
37445 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37446 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
37447 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
37448 SDValue ExtractedV = V.getOperand(0);
37449 SDValue OrigV = ExtractedV.getOperand(0);
37450 if (isNullConstant(ExtractedV.getOperand(1))) {
37451 MVT OrigVT = OrigV.getSimpleValueType();
37452 // Extract a subvector if necessary...
37453 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
37454 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
37455 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
37456 OrigVT.getVectorNumElements() / Ratio);
37457 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
37458 DAG.getIntPtrConstant(0, DL));
37460 Op = DAG.getBitcast(OpVT, OrigV);
37461 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
37468 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
37469 const X86Subtarget &Subtarget) {
37470 SDValue Op0 = N->getOperand(0);
37471 SDValue Op1 = N->getOperand(1);
37473 MVT VT = N->getSimpleValueType(0);
37476 // TEST (AND a, b) ,(AND a, b) -> TEST a, b
37477 if (Op0 == Op1 && Op1->getOpcode() == ISD::AND)
37478 return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0),
37479 Op0->getOperand(1));
37481 // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
37482 // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
37483 if (ISD::isBuildVectorAllZeros(Op0.getNode()) ||
37484 ISD::isBuildVectorAllZeros(Op1.getNode()))
37485 return getZeroVector(VT, Subtarget, DAG, DL);
37490 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
37491 const X86Subtarget &Subtarget) {
37492 MVT VT = N->getSimpleValueType(0);
37495 if (N->getOperand(0) == N->getOperand(1)) {
37496 if (N->getOpcode() == X86ISD::PCMPEQ)
37497 return getOnesVector(VT, DAG, DL);
37498 if (N->getOpcode() == X86ISD::PCMPGT)
37499 return getZeroVector(VT, Subtarget, DAG, DL);
37505 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
37506 TargetLowering::DAGCombinerInfo &DCI,
37507 const X86Subtarget &Subtarget) {
37508 if (DCI.isBeforeLegalizeOps())
37511 MVT OpVT = N->getSimpleValueType(0);
37513 // Early out for mask vectors.
37514 if (OpVT.getVectorElementType() == MVT::i1)
37518 SDValue Vec = N->getOperand(0);
37519 SDValue SubVec = N->getOperand(1);
37521 unsigned IdxVal = N->getConstantOperandVal(2);
37522 MVT SubVecVT = SubVec.getSimpleValueType();
37524 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
37525 // Inserting zeros into zeros is a nop.
37526 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
37529 // If we're inserting into a zero vector and then into a larger zero vector,
37530 // just insert into the larger zero vector directly.
37531 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
37532 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
37533 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
37534 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
37535 SubVec.getOperand(1),
37536 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
    // If we're inserting a bitcast into zeros, rewrite the insert and move the
    // bitcast to the other side. This helps with detecting zero extending
    // loads during isel.
    // TODO: Is this useful for other indices than 0?
37543 if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
37544 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
37545 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
37546 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
37547 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
37548 DAG.getBitcast(NewVT, Vec),
37549 SubVec.getOperand(0), N->getOperand(2));
37550 return DAG.getBitcast(OpVT, Insert);
37554 // If this is an insert of an extract, combine to a shuffle. Don't do this
37555 // if the insert or extract can be represented with a subregister operation.
37556 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
37557 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
37558 (IdxVal != 0 || !Vec.isUndef())) {
37559 int ExtIdxVal = SubVec.getConstantOperandVal(1);
37560 if (ExtIdxVal != 0) {
37561 int VecNumElts = OpVT.getVectorNumElements();
37562 int SubVecNumElts = SubVecVT.getVectorNumElements();
37563 SmallVector<int, 64> Mask(VecNumElts);
37564 // First create an identity shuffle mask.
37565 for (int i = 0; i != VecNumElts; ++i)
37567 // Now insert the extracted portion.
37568 for (int i = 0; i != SubVecNumElts; ++i)
37569 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
37571 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr + 16), Elts/2)
  // --> load32 addr
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr + 32), Elts/2)
  // --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr), Elts/2)
  // --> X86SubVBroadcast(load16 addr)
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr), Elts/2)
  // --> X86SubVBroadcast(load32 addr)
37592 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
37593 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
37594 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
37595 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
37596 if (Idx2 && Idx2->getZExtValue() == 0) {
37597 SDValue SubVec2 = Vec.getOperand(1);
37598 // If needed, look through bitcasts to get to the load.
37599 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
          bool Fast;
          unsigned Alignment = FirstLd->getAlignment();
37602 unsigned AS = FirstLd->getAddressSpace();
37603 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
37604 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
37605 OpVT, AS, Alignment, &Fast) && Fast) {
37606 SDValue Ops[] = {SubVec2, SubVec};
37607 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
        // If the lower and upper loads are the same and they are the only
        // users of the load, then lower to a VBROADCASTF128/VBROADCASTI128/etc.
37614 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
37615 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
37616 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
37617 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
        // If this is a subv_broadcast inserted into both halves, use a larger
        // subv_broadcast.
        if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
37622 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
37623 SubVec.getOperand(0));
37625 // If we're inserting all zeros into the upper half, change this to
37626 // an insert into an all zeros vector. We will match this to a move
37627 // with implicit upper bit zeroing during isel.
37628 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
37629 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
37630 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
37631 Vec.getOperand(2));
      // If we are inserting into both halves of the vector, the starting
      // vector should be undef. If it isn't, make it so. Only do this if
      // the early insert has no other uses.
37636 // TODO: Should this be a generic DAG combine?
37637 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
37638 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
37639 SubVec2, Vec.getOperand(2));
37640 DCI.AddToWorklist(Vec.getNode());
37641 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
37651 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
37652 TargetLowering::DAGCombinerInfo &DCI,
37653 const X86Subtarget &Subtarget) {
37654 if (DCI.isBeforeLegalizeOps())
37657 MVT OpVT = N->getSimpleValueType(0);
37658 SDValue InVec = N->getOperand(0);
37659 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
37661 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
37662 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
37664 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
37665 if (OpVT.getScalarType() == MVT::i1)
37666 return DAG.getConstant(1, SDLoc(N), OpVT);
37667 return getOnesVector(OpVT, DAG, SDLoc(N));
37670 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
37671 return DAG.getBuildVector(
37673 InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
37678 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
37679 DAGCombinerInfo &DCI) const {
37680 SelectionDAG &DAG = DCI.DAG;
37681 switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
37684 case X86ISD::PEXTRW:
37685 case X86ISD::PEXTRB:
37686 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
37687 case ISD::INSERT_SUBVECTOR:
37688 return combineInsertSubvector(N, DAG, DCI, Subtarget);
37689 case ISD::EXTRACT_SUBVECTOR:
37690 return combineExtractSubvector(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
37694 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
37695 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
37696 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
37697 case ISD::SUB: return combineSub(N, DAG, Subtarget);
37698 case X86ISD::SBB: return combineSBB(N, DAG);
37699 case X86ISD::ADC: return combineADC(N, DAG, DCI);
37700 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
37704 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
37705 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
37706 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
37707 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
37708 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
37709 case ISD::STORE: return combineStore(N, DAG, Subtarget);
37710 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
37711 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
37712 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
37715 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
37716 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
37717 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
37718 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
37719 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
37726 case X86ISD::BT: return combineBT(N, DAG, DCI);
37727 case ISD::ANY_EXTEND:
37728 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
37729 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
37730 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
37731 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
37732 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
37733 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
37734 case X86ISD::PACKSS:
37735 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
37736 case X86ISD::VSHLI:
37737 case X86ISD::VSRAI:
37738 case X86ISD::VSRLI:
37739 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
37740 case ISD::SIGN_EXTEND_VECTOR_INREG:
37741 case ISD::ZERO_EXTEND_VECTOR_INREG:
37742 case X86ISD::VSEXT:
37743 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
37744 case X86ISD::PINSRB:
37745 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
37746 case X86ISD::SHUFP: // Handle all target specific shuffles
37747 case X86ISD::INSERTPS:
37748 case X86ISD::EXTRQI:
37749 case X86ISD::INSERTQI:
37750 case X86ISD::PALIGNR:
37751 case X86ISD::VSHLDQ:
37752 case X86ISD::VSRLDQ:
37753 case X86ISD::BLENDI:
37754 case X86ISD::UNPCKH:
37755 case X86ISD::UNPCKL:
37756 case X86ISD::MOVHLPS:
37757 case X86ISD::MOVLHPS:
37758 case X86ISD::PSHUFB:
37759 case X86ISD::PSHUFD:
37760 case X86ISD::PSHUFHW:
37761 case X86ISD::PSHUFLW:
37762 case X86ISD::MOVSHDUP:
37763 case X86ISD::MOVSLDUP:
37764 case X86ISD::MOVDDUP:
37765 case X86ISD::MOVSS:
37766 case X86ISD::MOVSD:
37767 case X86ISD::VBROADCAST:
37768 case X86ISD::VPPERM:
37769 case X86ISD::VPERMI:
37770 case X86ISD::VPERMV:
37771 case X86ISD::VPERMV3:
37772 case X86ISD::VPERMIV3:
37773 case X86ISD::VPERMIL2:
37774 case X86ISD::VPERMILPI:
37775 case X86ISD::VPERMILPV:
37776 case X86ISD::VPERM2X128:
37777 case X86ISD::VZEXT_MOVL:
37778 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
37779 case X86ISD::FMADD_RND:
37780 case X86ISD::FMADDS1_RND:
37781 case X86ISD::FMADDS3_RND:
37782 case X86ISD::FMADDS1:
37783 case X86ISD::FMADDS3:
37784 case X86ISD::FMADD4S:
37785 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
37786 case X86ISD::FMADDSUB_RND:
37787 case X86ISD::FMSUBADD_RND:
37788 case X86ISD::FMADDSUB:
37789 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
37790 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
37791 case X86ISD::MGATHER:
37792 case X86ISD::MSCATTER:
  case ISD::MGATHER:
  case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI, Subtarget);
37795 case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
37796 case X86ISD::PCMPEQ:
37797 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
37803 /// Return true if the target has native support for the specified value type
37804 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
37805 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
37806 /// some i16 instructions are slow.
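/// E.g. a 16-bit "addw" carries a 0x66 operand-size prefix in 32/64-bit mode,
/// making it one byte longer than the equivalent 32-bit "addl".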
37807 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
37808 if (!isTypeLegal(VT))
37810 if (VT != MVT::i16)
37817 case ISD::SIGN_EXTEND:
37818 case ISD::ZERO_EXTEND:
37819 case ISD::ANY_EXTEND:
37832 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
37833 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
37834 /// we don't adjust the stack we clobber the first frame index.
37835 /// See X86InstrInfo::copyPhysReg.
37836 static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
37837 const MachineRegisterInfo &MRI = MF.getRegInfo();
37838 return any_of(MRI.reg_instructions(X86::EFLAGS),
37839 [](const MachineInstr &RI) { return RI.isCopy(); });
37842 void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
37843 if (hasCopyImplyingStackAdjustment(MF)) {
37844 MachineFrameInfo &MFI = MF.getFrameInfo();
37845 MFI.setHasCopyImplyingStackAdjustment(true);
37848 TargetLoweringBase::finalizeLowering(MF);
/// This method queries the target whether it is beneficial for the DAG
/// combiner to promote the specified node. If true, it should return the
/// desired promotion type by reference.
37854 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
37855 EVT VT = Op.getValueType();
37856 if (VT != MVT::i16)
37859 bool Promote = false;
37860 bool Commute = false;
37861 switch (Op.getOpcode()) {
37863 case ISD::SIGN_EXTEND:
37864 case ISD::ZERO_EXTEND:
37865 case ISD::ANY_EXTEND:
37870 SDValue N0 = Op.getOperand(0);
37871 // Look out for (store (shl (load), x)).
37872 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
37885 SDValue N0 = Op.getOperand(0);
37886 SDValue N1 = Op.getOperand(1);
37887 if (!Commute && MayFoldLoad(N1))
37889 // Avoid disabling potential load folding opportunities.
37890 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
37892 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
37902 bool X86TargetLowering::
37903 isDesirableToCombineBuildVectorToShuffleTruncate(
37904 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
37906 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
37907 "Element count mismatch");
37909 Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
37910 "Shuffle Mask expected to be legal");
  // For 32-bit elements VPERMD is better than shuffle+truncate.
  // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
37914 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
37917 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
37923 //===----------------------------------------------------------------------===//
37924 // X86 Inline Assembly Support
37925 //===----------------------------------------------------------------------===//
37927 // Helper to match a string separated by whitespace.
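// E.g. (a sketch): matchAsm("bswap  $0", {"bswap", "$0"}) matches, while
// matchAsm("bswapl $0", {"bswap", "$0"}) does not, because each piece must
// match a whole whitespace-separated token.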
37928 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
37929 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
37931 for (StringRef Piece : Pieces) {
37932 if (!S.startswith(Piece)) // Check if the piece matches.
37935 S = S.substr(Piece.size());
37936 StringRef::size_type Pos = S.find_first_not_of(" \t");
37937 if (Pos == 0) // We matched a prefix.
37946 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
37948 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
37949 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
37950 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
37951 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
37953 if (AsmPieces.size() == 3)
37955 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
37962 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
37963 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
37965 const std::string &AsmStr = IA->getAsmString();
37967 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
37968 if (!Ty || Ty->getBitWidth() % 16 != 0)
37971 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
37972 SmallVector<StringRef, 4> AsmPieces;
37973 SplitString(AsmStr, AsmPieces, ";\n");
37975 switch (AsmPieces.size()) {
37976 default: return false;
37978 // FIXME: this should verify that we are targeting a 486 or better. If not,
37979 // we will turn this bswap into something that will be lowered to logical
37980 // ops instead of emitting the bswap asm. For now, we don't support 486 or
37981 // lower so don't worry about this.
37983 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
37984 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
37985 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
37986 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
37987 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
37988 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
37989 // No need to check constraints, nothing other than the equivalent of
37990 // "=r,0" would be valid here.
37991 return IntrinsicLowering::LowerToByteSwap(CI);
37994 // rorw $$8, ${0:w} --> llvm.bswap.i16
37995 if (CI->getType()->isIntegerTy(16) &&
37996 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
37997 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
37998 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
38000 StringRef ConstraintsStr = IA->getConstraintString();
38001 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
38002 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
38003 if (clobbersFlagRegisters(AsmPieces))
38004 return IntrinsicLowering::LowerToByteSwap(CI);
38008 if (CI->getType()->isIntegerTy(32) &&
38009 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
38010 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
38011 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
38012 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
38014 StringRef ConstraintsStr = IA->getConstraintString();
38015 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
38016 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
38017 if (clobbersFlagRegisters(AsmPieces))
38018 return IntrinsicLowering::LowerToByteSwap(CI);
38021 if (CI->getType()->isIntegerTy(64)) {
38022 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
38023 if (Constraints.size() >= 2 &&
38024 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
38025 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
38026 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
38027 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
38028 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
38029 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
38030 return IntrinsicLowering::LowerToByteSwap(CI);
38038 /// Given a constraint letter, return the type of constraint for this target.
38039 X86TargetLowering::ConstraintType
38040 X86TargetLowering::getConstraintType(StringRef Constraint) const {
38041 if (Constraint.size() == 1) {
38042 switch (Constraint[0]) {
38054 case 'k': // AVX512 masking registers.
38055 return C_RegisterClass;
38079 else if (Constraint.size() == 2) {
38080 switch (Constraint[0]) {
38084 switch (Constraint[1]) {
38095 return C_RegisterClass;
38099 return TargetLowering::getConstraintType(Constraint);
38102 /// Examine constraint type and operand type and determine a weight value.
38103 /// This object must already have been set up with the operand type
38104 /// and the current alternative constraint selected.
38105 TargetLowering::ConstraintWeight
38106 X86TargetLowering::getSingleConstraintMatchWeight(
38107 AsmOperandInfo &info, const char *constraint) const {
38108 ConstraintWeight weight = CW_Invalid;
38109 Value *CallOperandVal = info.CallOperandVal;
38110 // If we don't have a value, we can't do a match,
38111 // but allow it at the lowest weight.
38112 if (!CallOperandVal)
38114 Type *type = CallOperandVal->getType();
38115 // Look at the constraint type.
38116 switch (*constraint) {
38118 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
38130 if (CallOperandVal->getType()->isIntegerTy())
38131 weight = CW_SpecificReg;
38136 if (type->isFloatingPointTy())
38137 weight = CW_SpecificReg;
38140 if (type->isX86_MMXTy() && Subtarget.hasMMX())
38141 weight = CW_SpecificReg;
38144 unsigned Size = StringRef(constraint).size();
  // Pick 'i' as the next char, as 'Yi' and 'Y' are synonymous when matching 'Y'.
38146 char NextChar = Size == 2 ? constraint[1] : 'i';
38149 switch (NextChar) {
38155 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
38156 return CW_SpecificReg;
38158 // Conditional OpMask regs (AVX512)
38160 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
38161 return CW_Register;
38165 if (type->isX86_MMXTy() && Subtarget.hasMMX())
38168 // Any SSE reg when ISA >= SSE2, same as 'Y'
38172 if (!Subtarget.hasSSE2())
38176 // Fall through (handle "Y" constraint).
38180 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
38181 weight = CW_Register;
38184 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
38185 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
38186 weight = CW_Register;
38189 // Enable conditional vector operations using %k<#> registers.
38190 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
38191 weight = CW_Register;
38194 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
38195 if (C->getZExtValue() <= 31)
38196 weight = CW_Constant;
38200 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38201 if (C->getZExtValue() <= 63)
38202 weight = CW_Constant;
38206 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38207 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
38208 weight = CW_Constant;
38212 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38213 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
38214 weight = CW_Constant;
38218 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38219 if (C->getZExtValue() <= 3)
38220 weight = CW_Constant;
38224 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38225 if (C->getZExtValue() <= 0xff)
38226 weight = CW_Constant;
38231 if (isa<ConstantFP>(CallOperandVal)) {
38232 weight = CW_Constant;
38236 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38237 if ((C->getSExtValue() >= -0x80000000LL) &&
38238 (C->getSExtValue() <= 0x7fffffffLL))
38239 weight = CW_Constant;
38243 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38244 if (C->getZExtValue() <= 0xffffffff)
38245 weight = CW_Constant;
38252 /// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
38255 const char *X86TargetLowering::
38256 LowerXConstraint(EVT ConstraintVT) const {
38257 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
38258 // 'f' like normal targets.
38259 if (ConstraintVT.isFloatingPoint()) {
38260 if (Subtarget.hasSSE2())
38262 if (Subtarget.hasSSE1())
38266 return TargetLowering::LowerXConstraint(ConstraintVT);
38269 /// Lower the specified operand into the Ops vector.
38270 /// If it is invalid, don't add anything to Ops.
38271 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
38272 std::string &Constraint,
38273 std::vector<SDValue>&Ops,
38274 SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
38278 if (Constraint.length() > 1) return;
38280 char ConstraintLetter = Constraint[0];
38281 switch (ConstraintLetter) {
38284 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38285 if (C->getZExtValue() <= 31) {
38286 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38287 Op.getValueType());
38293 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38294 if (C->getZExtValue() <= 63) {
38295 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38296 Op.getValueType());
38302 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38303 if (isInt<8>(C->getSExtValue())) {
38304 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38305 Op.getValueType());
38311 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38312 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
38313 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
38314 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
38315 Op.getValueType());
38321 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38322 if (C->getZExtValue() <= 3) {
38323 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38324 Op.getValueType());
38330 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38331 if (C->getZExtValue() <= 255) {
38332 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38333 Op.getValueType());
38339 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38340 if (C->getZExtValue() <= 127) {
38341 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38342 Op.getValueType());
38348 // 32-bit signed value
38349 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38350 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
38351 C->getSExtValue())) {
38352 // Widen to 64 bits here to get it sign extended.
38353 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
38356 // FIXME gcc accepts some relocatable values here too, but only in certain
38357 // memory models; it's complicated.
38362 // 32-bit unsigned value
38363 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38364 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
38365 C->getZExtValue())) {
38366 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38367 Op.getValueType());
38371 // FIXME gcc accepts some relocatable values here too, but only in certain
38372 // memory models; it's complicated.
38376 // Literal immediates are always ok.
38377 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
38378 // Widen to 64 bits here to get it sign extended.
38379 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
38383 // In any sort of PIC mode addresses need to be computed at runtime by
38384 // adding in a register or some sort of table lookup. These can't
38385 // be used as immediates.
38386 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
38389 // If we are in non-pic codegen mode, we allow the address of a global (with
38390 // an optional displacement) to be used with 'i'.
38391 GlobalAddressSDNode *GA = nullptr;
38392 int64_t Offset = 0;
38394 // Match either (GA), (GA+C), (GA+C1+C2), etc.
38396 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
38397 Offset += GA->getOffset();
38399 } else if (Op.getOpcode() == ISD::ADD) {
38400 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
38401 Offset += C->getZExtValue();
38402 Op = Op.getOperand(0);
38405 } else if (Op.getOpcode() == ISD::SUB) {
38406 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
38407 Offset += -C->getZExtValue();
38408 Op = Op.getOperand(0);
38413 // Otherwise, this isn't something we can handle, reject it.
38417 const GlobalValue *GV = GA->getGlobal();
38418 // If we require an extra load to get this address, as in PIC mode, we
38419 // can't accept it.
38420 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
38423 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
38424 GA->getValueType(0), Offset);
38429 if (Result.getNode()) {
38430 Ops.push_back(Result);
38433 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
38436 /// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
38438 static bool isGRClass(const TargetRegisterClass &RC) {
38439 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
38440 RC.hasSuperClassEq(&X86::GR16RegClass) ||
38441 RC.hasSuperClassEq(&X86::GR32RegClass) ||
38442 RC.hasSuperClassEq(&X86::GR64RegClass) ||
38443 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
38446 /// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
38448 static bool isFRClass(const TargetRegisterClass &RC) {
38449 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
38450 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
38451 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
38452 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
38453 RC.hasSuperClassEq(&X86::VR512RegClass);
38456 std::pair<unsigned, const TargetRegisterClass *>
38457 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
38458 StringRef Constraint,
38460 // First, see if this is a constraint that directly corresponds to an LLVM
38462 if (Constraint.size() == 1) {
38463 // GCC Constraint Letters
38464 switch (Constraint[0]) {
38466 // TODO: Slight differences here in allocation order and leaving
38467 // RIP in the class. Do they matter any more here than they do
38468 // in the normal allocation?
38470 if (Subtarget.hasAVX512()) {
38471 // Only supported in AVX512 or later.
38472 switch (VT.SimpleTy) {
38475 return std::make_pair(0U, &X86::VK32RegClass);
38477 return std::make_pair(0U, &X86::VK16RegClass);
38479 return std::make_pair(0U, &X86::VK8RegClass);
38481 return std::make_pair(0U, &X86::VK1RegClass);
38483 return std::make_pair(0U, &X86::VK64RegClass);
38487 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
38488 if (Subtarget.is64Bit()) {
38489 if (VT == MVT::i32 || VT == MVT::f32)
38490 return std::make_pair(0U, &X86::GR32RegClass);
38491 if (VT == MVT::i16)
38492 return std::make_pair(0U, &X86::GR16RegClass);
38493 if (VT == MVT::i8 || VT == MVT::i1)
38494 return std::make_pair(0U, &X86::GR8RegClass);
38495 if (VT == MVT::i64 || VT == MVT::f64)
38496 return std::make_pair(0U, &X86::GR64RegClass);
38500 // 32-bit fallthrough
38501 case 'Q': // Q_REGS
38502 if (VT == MVT::i32 || VT == MVT::f32)
38503 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
38504 if (VT == MVT::i16)
38505 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
38506 if (VT == MVT::i8 || VT == MVT::i1)
38507 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
38508 if (VT == MVT::i64)
38509 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');
      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
      case MVT::v4f32: case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
      case MVT::v8f32: case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      // AVX-512 types.
      case MVT::v16i32: case MVT::v8i64: case MVT::v16f32: case MVT::v8f64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default: break;
    case 'i': case 't': case '2':
      return getRegForInlineAsmConstraint(TRI, "Y", VT);
    case 'm':   // 'Ym' -> MMX_REGS if MMX is allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'z': case '0':   // 'Yz' -> XMM0.
      if (!Subtarget.hasSSE1()) break;
      return std::make_pair(X86::XMM0, &X86::VR128RegClass);
    case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32: return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16: return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:  return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:  return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64: return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }
  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass *> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  // Not found as a standard register?
  if (!Res.second) {
    // Map {st(0)} .. {st(7)} onto the x87 stack registers FP0 .. FP7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }
    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }
  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res;   // Correct type already, nothing to do.
  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64-bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegen'erated file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      bool is64Bit = Subtarget.is64Bit();
      const TargetRegisterClass *RC =
          Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
        : &X86::GR64RegClass;
      if (RC->contains(DestReg))
        Res = std::make_pair(DestReg, RC);
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
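// Illustrative example, not part of the original source: the size remapping
// above is what lets an explicit-register constraint carry a wider type,
// e.g. in LLVM IR:
//
//   %lo = call i32 asm "rdtsc", "={ax},~{dx}"()
//
// The generic matcher resolves "{ax}" to AX in a 16-bit class; because the
// operand type is i32, the code above walks to the super-register and hands
// back EAX in a 32-bit GR class instead.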
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi, %rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
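// Illustrative example, not part of the original source: for a legal mode
// describing an access such as (%rsi,%rcx,2) -- BaseReg set, AM.Scale == 2 --
// this hook returns 1, while the plain (%rsi) form with AM.Scale == 0 returns
// 0. Callers such as LoopStrengthReduce use this cost to prefer the unscaled
// form whenever both addressing modes are legal.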
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize = Attr.hasAttribute(AttributeList::FunctionIndex,
                                   Attribute::MinSize);
  return OptSize && !VT.isVector();
}
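// Illustrative consequence, not part of the original source: in a function
// carrying the minsize attribute, a scalar 'x / 7' stays as a single idiv
// (the shortest encoding), while without minsize it is expanded into the
// usual magic-number multiply/shift sequence. A <4 x i32> division is never
// reported as cheap, since keeping it would only force scalarization later.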
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
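// Illustrative note, not part of the original source: this split-CSR path is
// reached for functions using the cxx_fast_tlscc calling convention, e.g. the
// TLS wrapper a C++ front end emits (hypothetical symbol name):
//
//   define cxx_fast_tlscc i8* @_ZTW5guard() nounwind { ... }
//
// Instead of spilling callee-saved GR64 registers in the prologue/epilogue,
// they are copied into virtual registers on entry and copied back before each
// return, so the register allocator can drop saves that turn out to be
// unnecessary.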
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  // If the function specifically requests stack probes, emit them.
  if (MF.getFunction().hasFnAttribute("probe-stack"))
    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();

  // Generally, if we aren't on Windows, the platform ABI does not include
  // support for stack probes, so don't emit them.
  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
    return "";

  // We need a stack probe to conform to the Windows ABI. Choose the right
  // symbol.
  if (Subtarget.is64Bit())
    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
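// Illustrative example, not part of the original source: a function can force
// a specific probe symbol through the string attribute, e.g. in LLVM IR
// (hypothetical symbol name):
//
//   define void @big_frame() "probe-stack"="__my_probe" { ... }
//
// With no attribute, a Win64 target gets "__chkstk" (or "___chkstk_ms" under
// MinGW), 32-bit Windows gets "_chkstk" (or "_alloca" under MinGW), and
// non-Windows targets get no probe symbol at all.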