//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// emitting code that is known to be wrong.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
}

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
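  // In practice this means a scalar SETCC produces 0 or 1 in an i8 register,
  // while a vector SETCC produces all-zeros or all-ones per lane; the
  // all-ones form is what allows a VSELECT to be matched as a bitwise blend.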

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
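  // A bypass entry such as addBypassSlowDiv(32, 8) makes codegen emit a
  // run-time check: when both operands of a 32-bit divide fit in 8 bits, the
  // much cheaper 8-bit divide is used instead of the full-width one.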

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
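  // Expand rewrites a truncating store as an explicit TRUNCATE followed by a
  // plain store of the narrower type, which on x86 is simply a store of the
  // appropriate subregister.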

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
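  // For example, x86's DIV/IDIV produce the quotient in *AX and the remainder
  // in *DX with one instruction, so x/y and x%y both legalize to a single
  // SDIVREM/UDIVREM node that CSE can share.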

  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT,  MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC,     VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,  Expand);
  setOperationAction(ISD::FP_ROUND_INREG,    MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ,            MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ,            MVT::i16, Custom);
    setOperationAction(ISD::CTTZ,            MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ,            MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
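  // BSF leaves its destination undefined when the source is zero, so plain
  // CTTZ needs a Custom zero check (typically a CMOV), whereas
  // CTTZ_ZERO_UNDEF maps directly onto BSF. With BMI, TZCNT defines the
  // zero case and both forms can be selected directly.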

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ,            MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ,            MVT::i8,  Custom);
    setOperationAction(ISD::CTLZ,            MVT::i16, Custom);
    setOperationAction(ISD::CTLZ,            MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8,  Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ,            MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8,  Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC,  VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC,  VT, Custom);
    setOperationAction(ISD::SETCCE, VT, Custom);
  }

  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
  // handling; they are a light-weight setjmp/longjmp replacement used to
  // support continuation, user-level threading, and the like. As a result, no
  // other SjLj exception interfaces are implemented, and please don't build
  // your own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool,     VT, Custom);
    setOperationAction(ISD::JumpTable,        VT, Custom);
    setOperationAction(ISD::GlobalAddress,    VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol,   VT, Custom);
    setOperationAction(ISD::BlockAddress,     VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86).
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
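  // On 32-bit targets a 64-bit shift is split into two register halves; the
  // *_PARTS nodes carry (lo, hi, amount) and are lowered to SHLD/SHRD
  // sequences plus a fixup for shift amounts of 32 or more.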

  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR,  VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE,    VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE,   MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE,    MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END,   MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod.
      setOperationAction(ISD::FSIN,    VT, Expand);
      setOperationAction(ISD::FCOS,    VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN,    MVT::f32, Expand);
    setOperationAction(ISD::FCOS,    MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN,    MVT::f64, Expand);
      setOperationAction(ISD::FCOS,    MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF,     VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN,    VT, Expand);
        setOperationAction(ISD::FCOS,    VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
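    // Immediates registered here are materialized with FLD0/FLD1 (plus FCHS
    // for the negated forms) instead of being loaded from the constant pool.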
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS,      MVT::f128, Custom);
      setOperationAction(ISD::FNEG,      MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN,    MVT::f80, Expand);
      setOperationAction(ISD::FCOS,    MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR,     MVT::f80, Expand);
    setOperationAction(ISD::FCEIL,      MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC,     MVT::f80, Expand);
    setOperationAction(ISD::FRINT,      MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA,        MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG,    MVT::f80, Expand);
  setOperationAction(ISD::FLOG2,   MVT::f80, Expand);
  setOperationAction(ISD::FLOG10,  MVT::f80, Expand);
  setOperationAction(ISD::FEXP,    MVT::f80, Expand);
  setOperationAction(ISD::FEXP2,   MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN,      VT, Expand);
    setOperationAction(ISD::FSINCOS,   VT, Expand);
    setOperationAction(ISD::FCOS,      VT, Expand);
    setOperationAction(ISD::FREM,      VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW,      VT, Expand);
    setOperationAction(ISD::FLOG,      VT, Expand);
    setOperationAction(ISD::FLOG2,     VT, Expand);
    setOperationAction(ISD::FLOG10,    VT, Expand);
    setOperationAction(ISD::FEXP,      VT, Expand);
    setOperationAction(ISD::FEXP2,     VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR,  VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Expand);
    setOperationAction(ISD::FMA,        VT, Expand);
    setOperationAction(ISD::FFLOOR,     VT, Expand);
    setOperationAction(ISD::FCEIL,      VT, Expand);
    setOperationAction(ISD::FTRUNC,     VT, Expand);
    setOperationAction(ISD::FRINT,      VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI,  VT, Expand);
    setOperationAction(ISD::MULHS,      VT, Expand);
    setOperationAction(ISD::UMUL_LOHI,  VT, Expand);
    setOperationAction(ISD::MULHU,      VT, Expand);
    setOperationAction(ISD::SDIVREM,    VT, Expand);
    setOperationAction(ISD::UDIVREM,    VT, Expand);
    setOperationAction(ISD::CTPOP,      VT, Expand);
    setOperationAction(ISD::CTTZ,       VT, Expand);
    setOperationAction(ISD::CTLZ,       VT, Expand);
    setOperationAction(ISD::ROTL,       VT, Expand);
    setOperationAction(ISD::ROTR,       VT, Expand);
    setOperationAction(ISD::BSWAP,      VT, Expand);
    setOperationAction(ISD::SETCC,      VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE,    VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND,  VT, Expand);
    setOperationAction(ISD::SELECT_CC,   VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL,       MVT::v16i8, Custom);
    setOperationAction(ISD::MUL,       MVT::v4i32, Custom);
    setOperationAction(ISD::MUL,       MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU,     MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS,     MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU,     MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS,     MVT::v8i16, Legal);
    setOperationAction(ISD::MUL,       MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG,      MVT::v2f64, Custom);
    setOperationAction(ISD::FABS,      MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ,  VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8,  Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8,  Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v2i8,  Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v4i8,  Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v8i8,  Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR,   VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT,        VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
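    // Promoting a bitwise op is just a pair of bitcasts around the v2i64
    // node: the bit pattern is unchanged, so a single SSE pattern per opcode
    // covers all of the integer element widths.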

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8,  Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND,  MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8,  Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS,        MVT::v16i8, Legal);
    setOperationAction(ISD::ABS,        MVT::v8i16, Legal);
    setOperationAction(ISD::ABS,        MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ,       MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ,       MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ,       MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ,       MVT::v2i64, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::FFLOOR,     RoundedTy, Legal);
      setOperationAction(ISD::FCEIL,      RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC,     RoundedTy, Legal);
      setOperationAction(ISD::FRINT,      RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8,  Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }
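    // Marking these extending loads Legal lets a (sext/zext (load ...)) pair
    // be selected directly as one PMOVSX/PMOVZX with a memory operand.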

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR,     VT, Legal);
      setOperationAction(ISD::FCEIL,      VT, Legal);
      setOperationAction(ISD::FTRUNC,     VT, Legal);
      setOperationAction(ISD::FRINT,      VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG,       VT, Custom);
      setOperationAction(ISD::FABS,       VT, Custom);
      setOperationAction(ISD::FCOPYSIGN,  VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND,   MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8,  Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE,   MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE,   MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE,   MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ,  VT, Custom);
      setOperationAction(ISD::CTLZ,  VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64,  Custom);
    setOperationAction(ISD::MUL, MVT::v8i32,  HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8,  Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8,  Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8,  Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
      }
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD,  VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1,  &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1,  &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : { ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD }) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8,  Legal);
      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
    }

    for (MVT VT : { MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                    MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                    MVT::v8i64, MVT::v32i16, MVT::v64i8 }) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }
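    // vXi1 masks live in k-registers, so extending loads and truncating
    // stores of mask types are lowered by hand, typically via a KMOV to or
    // from a GPR, rather than by the generic legalizer.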
1174 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1175 setOperationAction(ISD::FNEG, VT, Custom);
1176 setOperationAction(ISD::FABS, VT, Custom);
1177 setOperationAction(ISD::FMA, VT, Legal);
1178 setOperationAction(ISD::FCOPYSIGN, VT, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,  Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,  Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32,  Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1,   Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1,  Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8,  Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,  Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,  Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8,  Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1,  Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1,   Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1,   Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1,   Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1,   Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1,   Custom);
    setOperationAction(ISD::FP_ROUND,   MVT::v8f32,  Legal);
    setOperationAction(ISD::FP_EXTEND,  MVT::v8f32,  Legal);

    setTruncStoreAction(MVT::v8i64,  MVT::v8i8,   Legal);
    setTruncStoreAction(MVT::v8i64,  MVT::v8i16,  Legal);
    setTruncStoreAction(MVT::v8i64,  MVT::v8i32,  Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8,  Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
        setOperationAction(ISD::MLOAD,  VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);

    if (Subtarget.hasDQI()) {
      for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
        setOperationAction(ISD::SINT_TO_FP, VT, Legal);
        setOperationAction(ISD::UINT_TO_FP, VT, Legal);
        setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);
      }
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }

    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP,  MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP,  MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT,  MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT,  MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP,  MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT,  MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT,  MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);

      // FIXME: These instructions are available on SSE/AVX2; add the relevant
      // patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8,  Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8,  Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }

    setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
    setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8,  Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16,  Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR,     VT, Legal);
      setOperationAction(ISD::FCEIL,      VT, Legal);
      setOperationAction(ISD::FTRUNC,     VT, Legal);
      setOperationAction(ISD::FRINT,      VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64,  Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64,  Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1,  Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1,   Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1,  Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1,   Custom);
    setOperationAction(ISD::SELECT,             MVT::v8f64,  Custom);
    setOperationAction(ISD::SELECT,             MVT::v8i64,  Custom);
    setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);

    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
    setOperationAction(ISD::ABS, MVT::v4i64, Legal);
    setOperationAction(ISD::ABS, MVT::v2i64, Legal);

    for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD,      VT, Custom);
      setOperationAction(ISD::SUB,      VT, Custom);
      setOperationAction(ISD::MUL,      VT, Custom);
      setOperationAction(ISD::SETCC,    VT, Custom);
      setOperationAction(ISD::SELECT,   VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Expand);
    }
1336 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1337 setOperationAction(ISD::SMAX, VT, Legal);
1338 setOperationAction(ISD::UMAX, VT, Legal);
1339 setOperationAction(ISD::SMIN, VT, Legal);
1340 setOperationAction(ISD::UMIN, VT, Legal);
1341 setOperationAction(ISD::ABS, VT, Legal);
1342 setOperationAction(ISD::SRL, VT, Custom);
1343 setOperationAction(ISD::SHL, VT, Custom);
1344 setOperationAction(ISD::SRA, VT, Custom);
1345 setOperationAction(ISD::CTPOP, VT, Custom);
1346 setOperationAction(ISD::CTTZ, VT, Custom);
1349 // Need to promote to 64-bit even though we have 32-bit masked instructions
1350 // because the IR optimizers rearrange bitcasts around logic ops leaving
1351 // too many variations to handle if we don't promote them.
1352 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1353 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1354 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
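    // For illustration (hedged sketch, not part of the original source): with
    // this promotion, IR such as
    //   %r = and <16 x i32> %a, %b
    // is bitcast to v8i64 and selected as a single 512-bit VPANDQ, so the
    // backend only has to match one 512-bit type per logic op regardless of
    // how the optimizers shuffled the surrounding bitcasts.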
    if (Subtarget.hasCDI()) {
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
                      MVT::v4i64, MVT::v8i64}) {
        setOperationAction(ISD::CTLZ, VT, Legal);
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
      }
    } // Subtarget.hasCDI()

    if (Subtarget.hasDQI()) {
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
      setOperationAction(ISD::MUL, MVT::v2i64, Legal);
      setOperationAction(ISD::MUL, MVT::v4i64, Legal);
      setOperationAction(ISD::MUL, MVT::v8i64, Legal);
    }

    if (Subtarget.hasVPOPCNTDQ()) {
      // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
      // version of popcntd/q.
      for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
                      MVT::v4i32, MVT::v2i64})
        setOperationAction(ISD::CTPOP, VT, Legal);
    }
    // Custom lower several nodes.
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }

    // Extract subvector is special because the value type
    // (result) is 256-bit but the source is 512-bit wide.
    // 128-bit was made Custom under AVX1.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                     MVT::v8f32, MVT::v4f64 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
                     MVT::v16i1, MVT::v32i1, MVT::v64i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::MGATHER, VT, Legal);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }
    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

    addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
    addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

    setOperationAction(ISD::ADD, MVT::v32i1, Custom);
    setOperationAction(ISD::ADD, MVT::v64i1, Custom);
    setOperationAction(ISD::SUB, MVT::v32i1, Custom);
    setOperationAction(ISD::SUB, MVT::v64i1, Custom);
    setOperationAction(ISD::MUL, MVT::v32i1, Custom);
    setOperationAction(ISD::MUL, MVT::v64i1, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
    setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
    setOperationAction(ISD::MUL, MVT::v32i16, Legal);
    setOperationAction(ISD::MUL, MVT::v64i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
    setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
    setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
    setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

    setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
      setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
    }

    LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
      setOperationAction(ISD::MLOAD, VT, Action);
      setOperationAction(ISD::MSTORE, VT, Action);
    }

    if (Subtarget.hasCDI()) {
      setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
      setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
    }

    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::ABS, VT, Legal);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);

      setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
    }

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      if (Subtarget.hasVLX()) {
        // FIXME: These instructions are also available on SSE/AVX2; add the
        // relevant patterns.
        setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
      }
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

    for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);

      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);

    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
    }
  }
  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  if (!Subtarget.is64Bit()) {
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  }

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub/Mul with overflow operations are custom lowered.
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);

    // Support carry in as value rather than glue.
    setOperationAction(ISD::ADDCARRY, VT, Custom);
    setOperationAction(ISD::SUBCARRY, VT, Custom);
  }
  if (!Subtarget.is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, nullptr);
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
  }

  // Combine sin / cos into one node or libcall if possible.
  if (Subtarget.hasSinCos()) {
    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
    setLibcallName(RTLIB::SINCOS_F64, "sincos");
    if (Subtarget.isTargetDarwin()) {
      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    }
  }

  if (Subtarget.isTargetWin64()) {
    setOperationAction(ISD::SDIV, MVT::i128, Custom);
    setOperationAction(ISD::UDIV, MVT::i128, Custom);
    setOperationAction(ISD::SREM, MVT::i128, Custom);
    setOperationAction(ISD::UREM, MVT::i128, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
  }

  // On 32-bit MSVC, `fmodf(f32)` is not defined; only `fmod(f64)` is. We
  // should promote the value to 64 bits to solve this. This is what the CRT
  // headers do: `fmodf` is an inline header function that casts to f64 and
  // calls `fmod`.
  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
                              Subtarget.isTargetWindowsItanium()))
    for (ISD::NodeType Op :
         {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
          ISD::FLOG10, ISD::FPOW, ISD::FSIN})
      if (isOperationExpand(Op, MVT::f32))
        setOperationAction(Op, MVT::f32, Promote);
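  // For illustration (hedged sketch, not part of the original source): with
  // FREM promoted on 32-bit MSVC targets, a call such as
  //   float r = fmodf(x, y);
  // is lowered roughly as
  //   float r = (float)fmod((double)x, (double)y);
  // mirroring the CRT's own inline fmodf wrapper.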
  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FNEG);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::MLOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MSTORE);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::MSCATTER);
  setTargetDAGCombine(ISD::MGATHER);

  computeRegisterProperties(Subtarget.getRegisterInfo());

  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 4;

  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
  setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
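  // For illustration (hedged sketch, not part of the original source): the
  // default value of 4 aligns loop headers to 2^4 = 16 bytes, so the low four
  // bits of an aligned loop header's address are zero; a value of 5 would
  // align them to 32 bytes instead.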
  // An out-of-order CPU can speculatively execute past a predictable branch,
  // but a conditional move could be stalled by an expensive earlier operation.
  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
  EnableExtLdPromotion = true;
  setPrefFunctionAlignment(4); // 2^4 bytes.

  verifyIntrinsicTables();
}

// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}

TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
  if (ExperimentalVectorWideningLegalization &&
      VT.getVectorNumElements() != 1 &&
      VT.getVectorElementType().getSimpleVT() != MVT::i1)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (VT.isSimple()) {
    MVT VVT = VT.getSimpleVT();
    const unsigned NumElts = VVT.getVectorNumElements();
    MVT EltVT = VVT.getVectorElementType();
    if (VVT.is512BitVector()) {
      if (Subtarget.hasAVX512())
        if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
            EltVT == MVT::f32 || EltVT == MVT::f64)
          switch (NumElts) {
          case 8: return MVT::v8i1;
          case 16: return MVT::v16i1;
          }
      if (Subtarget.hasBWI())
        if (EltVT == MVT::i8 || EltVT == MVT::i16)
          switch (NumElts) {
          case 32: return MVT::v32i1;
          case 64: return MVT::v64i1;
          }
    }

    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return MVT::getVectorVT(MVT::i1, NumElts);

    if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
      EVT LegalVT = getTypeToTransformTo(Context, VT);
      EltVT = LegalVT.getVectorElementType().getSimpleVT();
    }

    if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
      switch (NumElts) {
      case 2: return MVT::v2i1;
      case 4: return MVT::v4i1;
      case 8: return MVT::v8i1;
      }
  }

  return VT.changeVectorElementTypeToInteger();
}
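// For illustration (hedged sketch, not part of the original source): on an
// AVX512 target a compare of two v16i32 values produces a v16i1 mask (one
// k-register bit per lane), while on pre-AVX512 targets the same compare
// yields v16i32, i.e. an all-ones/all-zeros integer per lane.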
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}
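// For illustration (hedged sketch, not part of the original source): on
// 32-bit x86 with SSE enabled, a byval argument of a type like
//   struct S { int n; __m128 v; };
// is aligned to 16 bytes because it contains a 128-bit vector, whereas a
// struct of plain ints keeps the default 4-byte boundary.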
/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against the alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
/// not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function *F = MF.getFunction();
  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Size >= 32 && Subtarget.hasAVX()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2())
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      if (Subtarget.hasSSE1())
        return MVT::v4f32;
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}
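// For illustration (hedged sketch, not part of the original source): a
// 64-byte memcpy on an AVX target with fast unaligned accesses is expanded
// as two 32-byte v32i8 load/store pairs, while on a plain 64-bit target the
// same copy falls back to a sequence of i64 moves.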
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned,
                                                  bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {
  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction()->getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N integer arguments as being passed in registers.
  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
    Type *T = Args[Idx].Ty;
    if (T->isPointerTy() || T->isIntegerTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Args[Idx].IsInReg = true;
      }
  }
}
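// For illustration (hedged sketch, not part of the original source):
// compiling 32-bit code with -mregparm=3 records three register parameters
// on the module, so a libcall like f(int a, int b, int c) gets all three
// arguments marked IsInReg; a 64-bit argument consumes two of the available
// register slots.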
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}
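// For illustration (hedged sketch, not part of the original source): in the
// X86 backend address space 256 maps to the GS segment and 257 to FS, so the
// 64-bit non-kernel case above yields %fs-relative addressing, while kernel
// code and 32-bit targets use %gs.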
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}

Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    if (Subtarget.isTargetFuchsia()) {
      // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
      return SegmentOffset(IRB, 0x10, getAddressSpace());
    } else {
      // %fs:0x28, unless we're using a Kernel code model, in which case
      // it's %gs:0x28. gs:0x14 on i386.
      unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
      return SegmentOffset(IRB, Offset, getAddressSpace());
    }
  }

  return TargetLowering::getIRStackGuard(IRB);
}
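// For illustration (hedged sketch, not part of the original source): on
// x86_64-linux-gnu the stack protector therefore uses direct TLS accesses
// instead of a load from __stack_chk_guard, roughly:
//   movq %fs:0x28, %rax   # load the canary from tcbhead_t
//   ...
//   cmpq %fs:0x28, %rax   # recheck it before returning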
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    auto *SecurityCheckCookie = cast<Function>(
        M.getOrInsertFunction("__security_check_cookie",
                              Type::getVoidTy(M.getContext()),
                              Type::getInt8PtrTy(M.getContext())));
    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
    return;
  }
  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case
    // it's %gs:0x48. %gs:0x24 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");

  return SrcAS < 256 && DestAS < 256;
}
//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}
/// Lowers mask values (v*i1) into the local register values.
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8  / v16i1 -> i16
    // anyextend: i8   -> i32 / i16   -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
             (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }
  return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
}
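// For illustration (hedged sketch, not part of the original source):
// returning a v8i1 mask in an i32 location takes both stages,
//   v8i1 --bitcast--> i8 --anyext--> i32
// while v32i1 in an i32 location is a single bitcast, since the bit widths
// already match.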
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
         "Expected AVX512BW or BMI target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64.
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Split the value into two i32 halves.
  SDValue Lo, Hi;
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(0, Dl, MVT::i32));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(1, Dl, MVT::i32));

  // Attach the two i32 halves to their corresponding registers.
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
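// For illustration (hedged sketch, not part of the original source): on a
// 32-bit target a v64i1 mask is bitcast to i64 and split, so its low 32 bits
// travel in the first assigned GPR and the high 32 bits in the next one,
// e.g. EAX:EDX, as two ordinary i32 register operands.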
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction()->hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");
    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (ValVT == MVT::f64 &&
               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
      // Likewise we can't return F64 values with SSE1 only. gcc does so, but
      // llvm-gcc has never done it right and no one has noticed, so this
      // should be OK for now.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);

      assert(2 == RegsToPass.size() &&
             "Expecting two registers after Pass64BitArgInRegs");

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }

    // Add nodes to the DAG and add the values into the RetOps list
    for (auto &Reg : RegsToPass) {
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }
  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
    //
    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B
    //
    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call; see PR19530.
    if (UI->getNumOperands() > 4)
      return false;
    if (UI->getNumOperands() == 4 &&
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}
/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
///                        glue purposes. In the case the DAG is already using
///                        physical register instead of virtual, we should glue
///                        our new SDValue to InFlag SDvalue.
/// \return a new SDvalue of size 64bit.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  unsigned Reg;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
  if (nullptr == InFlag) {
    // When no physical register is present,
    // create an intermediate virtual register.
    Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.
    ArgValueLo =
        DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }

  // Convert the low i32 half to v32i1.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the high i32 half to v32i1.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two halves back into one v64i1 value.
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &Dl,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // On a 32 bit machine this case is handled by getv64i1Argument; on a
    // 64 bit machine there is no need to truncate the value, only to bitcast.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
  } else {
    MVT maskLen;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      maskLen = MVT::i8;
      break;
    case MVT::v16i1:
      maskLen = MVT::i16;
      break;
    case MVT::v32i1:
      maskLen = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget.is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
      if (VA.getValVT().isVector() &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8.
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    InVals.push_back(Val);
  }

  return Chain;
}
//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention seems to be standard for many Windows' API
//  routines and around. It differs from the C calling convention just a
//  little: the callee should clean up the stack, not the caller. Symbols
//  should also be decorated in some fancy way :) It doesn't support any
//  vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
  NotStructReturn,
  RegStructReturn,
  StackStructReturn
};
static StructReturnType
callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
  if (Outs.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}
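// For illustration (hedged sketch, not part of the original source): a call
// like
//   call void @f(%struct.S* sret %tmp)
// classifies as StackStructReturn on plain 32-bit x86, but as RegStructReturn
// when the sret pointer is marked inreg or the target is an MCU, where the
// pointer stays in a register.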
/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
  if (Ins.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       /*isTailCall*/false,
                       MachinePointerInfo(), MachinePointerInfo());
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::HHVM);
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
  case CallingConv::C:
  case CallingConv::X86_64_Win64:
  case CallingConv::X86_64_SysV:
  // Callee pop conventions:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::X86_FastCall:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  auto Attr =
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
    return false;

  ImmutableCallSite CS(CI);
  CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // bit width.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
  // taken by a return address.
  int Offset = 0;
  if (CallConv == CallingConv::X86_INTR) {
    // X86 interrupts may take one or two arguments.
    // On the stack there will be no return address as in a regular call.
    // The offset of the last argument needs to be set to -4/-8 bytes, and
    // the offset of the first argument (out of two) should be set to 0 bytes.
    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
    if (Subtarget.is64Bit() && Ins.size() == 2) {
      // The stack pointer needs to be realigned for 64 bit handlers with error
      // code, so the argument offset changes by 8 bytes.
      Offset += 8;
    }
  }

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
    // Adjust SP offset of interrupt parameter.
    if (CallConv == CallingConv::X86_INTR) {
      MFI.setObjectOffset(FI, Offset);
    }
    return DAG.getFrameIndex(FI, PtrVT);
  }
2758 // This is an argument in memory. We might be able to perform copy elision.
2759 if (Flags.isCopyElisionCandidate()) {
2760 EVT ArgVT = Ins[i].ArgVT;
2762 if (Ins[i].PartOffset == 0) {
2763 // If this is a one-part value or the first part of a multi-part value,
2764 // create a stack object for the entire argument value type and return a
2765 // load from our portion of it. This assumes that if the first part of an
2766 // argument is in memory, the rest will also be in memory.
2767 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2768 /*Immutable=*/false);
2769 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2771 ValVT, dl, Chain, PartAddr,
2772 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2774 // This is not the first piece of an argument in memory. See if there is
2775 // already a fixed stack object including this offset. If so, assume it
2776 // was created by the PartOffset == 0 branch above and create a load from
2777 // the appropriate offset into it.
2778 int64_t PartBegin = VA.getLocMemOffset();
2779 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2780 int FI = MFI.getObjectIndexBegin();
2781 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2782 int64_t ObjBegin = MFI.getObjectOffset(FI);
2783 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2784 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2787 if (MFI.isFixedObjectIndex(FI)) {
2789 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2790 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2792 ValVT, dl, Chain, Addr,
2793 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2794 Ins[i].PartOffset));
2799 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2800 VA.getLocMemOffset(), isImmutable);
2802 // Set SExt or ZExt flag.
2803 if (VA.getLocInfo() == CCValAssign::ZExt) {
2804 MFI.setObjectZExt(FI, true);
2805 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2806 MFI.setObjectSExt(FI, true);
2807 }
2809 // Adjust SP offset of interrupt parameter.
2810 if (CallConv == CallingConv::X86_INTR) {
2811 MFI.setObjectOffset(FI, Offset);
2812 }
2814 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2815 SDValue Val = DAG.getLoad(
2816 ValVT, dl, Chain, FIN,
2817 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
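// E.g. for an ExtendedInMem argument, a scalar i1 widened to an i8 slot is
// loaded at the wider type above and truncated back below, while a v*i1 mask
// is rebuilt with SCALAR_TO_VECTOR instead.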
2818 return ExtendedInMem
2819 ? (VA.getValVT().isVector()
2820 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2821 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2822 : Val;
2823 }
2825 // FIXME: Get this from tablegen.
2826 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2827 const X86Subtarget &Subtarget) {
2828 assert(Subtarget.is64Bit());
2830 if (Subtarget.isCallingConvWin64(CallConv)) {
2831 static const MCPhysReg GPR64ArgRegsWin64[] = {
2832 X86::RCX, X86::RDX, X86::R8, X86::R9
2833 };
2834 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2835 }
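// For reference: a call f(a, b, c) passes a/b/c in RCX/RDX/R8 under the Win64
// convention above, versus RDI/RSI/RDX in the SysV list below.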
2837 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2838 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2839 };
2840 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2841 }
2843 // FIXME: Get this from tablegen.
2844 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2845 CallingConv::ID CallConv,
2846 const X86Subtarget &Subtarget) {
2847 assert(Subtarget.is64Bit());
2848 if (Subtarget.isCallingConvWin64(CallConv)) {
2849 // The XMM registers which might contain var arg parameters are shadowed
2850 // in their paired GPR. So we only need to save the GPR to their home
2851 // slots.
2852 // TODO: __vectorcall will change this.
2853 return None;
2854 }
2856 const Function *Fn = MF.getFunction();
2857 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2858 bool isSoftFloat = Subtarget.useSoftFloat();
2859 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2860 "SSE register cannot be used when SSE is disabled!");
2861 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2862 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2863 // registers.
2864 return None;
2866 static const MCPhysReg XMMArgRegs64Bit[] = {
2867 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2868 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2869 };
2870 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2871 }
2873 #ifndef NDEBUG
2874 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2875 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2876 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2877 return A.getValNo() < B.getValNo();
2878 });
2879 }
2880 #endif
2882 SDValue X86TargetLowering::LowerFormalArguments(
2883 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2884 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2885 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2886 MachineFunction &MF = DAG.getMachineFunction();
2887 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2888 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2890 const Function *Fn = MF.getFunction();
2891 if (Fn->hasExternalLinkage() &&
2892 Subtarget.isTargetCygMing() &&
2893 Fn->getName() == "main")
2894 FuncInfo->setForceFramePointer(true);
2896 MachineFrameInfo &MFI = MF.getFrameInfo();
2897 bool Is64Bit = Subtarget.is64Bit();
2898 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2900 assert(
2901 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2902 "Var args not supported with calling conventions regcall, fastcc, ghc or hipe");
2904 if (CallConv == CallingConv::X86_INTR) {
2905 bool isLegal = Ins.size() == 1 ||
2906 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2907 (!Is64Bit && Ins[1].VT == MVT::i32)));
2908 if (!isLegal)
2909 report_fatal_error("X86 interrupts may take one or two arguments");
2910 }
2912 // Assign locations to all of the incoming arguments.
2913 SmallVector<CCValAssign, 16> ArgLocs;
2914 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2916 // Allocate shadow area for Win64.
2917 if (IsWin64)
2918 CCInfo.AllocateStack(32, 8);
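// The 32 bytes are the four 8-byte home slots that the Win64 ABI makes the
// caller reserve for the register parameters.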
2920 CCInfo.AnalyzeArguments(Ins, CC_X86);
2922 // In vectorcall calling convention a second pass is required for the HVA
2923 // registers.
2924 if (CallingConv::X86_VectorCall == CallConv) {
2925 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2926 }
2928 // The next loop assumes that the locations are in the same order as the
2929 // input arguments.
2930 assert(isSortedByValueNo(ArgLocs) &&
2931 "Argument Location list must be sorted before lowering");
2933 SDValue ArgValue;
2934 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2935 ++I, ++InsIndex) {
2936 assert(InsIndex < Ins.size() && "Invalid Ins index");
2937 CCValAssign &VA = ArgLocs[I];
2939 if (VA.isRegLoc()) {
2940 EVT RegVT = VA.getLocVT();
2941 if (VA.needsCustom()) {
2942 assert(
2943 VA.getValVT() == MVT::v64i1 &&
2944 "Currently the only custom case is when we split v64i1 to 2 regs");
2946 // v64i1 values, in regcall calling convention, that are
2947 // compiled for a 32-bit arch, are split up into two registers.
2948 ArgValue =
2949 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2950 } else {
2951 const TargetRegisterClass *RC;
2952 if (RegVT == MVT::i32)
2953 RC = &X86::GR32RegClass;
2954 else if (Is64Bit && RegVT == MVT::i64)
2955 RC = &X86::GR64RegClass;
2956 else if (RegVT == MVT::f32)
2957 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2958 else if (RegVT == MVT::f64)
2959 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2960 else if (RegVT == MVT::f80)
2961 RC = &X86::RFP80RegClass;
2962 else if (RegVT == MVT::f128)
2963 RC = &X86::FR128RegClass;
2964 else if (RegVT.is512BitVector())
2965 RC = &X86::VR512RegClass;
2966 else if (RegVT.is256BitVector())
2967 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2968 else if (RegVT.is128BitVector())
2969 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2970 else if (RegVT == MVT::x86mmx)
2971 RC = &X86::VR64RegClass;
2972 else if (RegVT == MVT::v1i1)
2973 RC = &X86::VK1RegClass;
2974 else if (RegVT == MVT::v8i1)
2975 RC = &X86::VK8RegClass;
2976 else if (RegVT == MVT::v16i1)
2977 RC = &X86::VK16RegClass;
2978 else if (RegVT == MVT::v32i1)
2979 RC = &X86::VK32RegClass;
2980 else if (RegVT == MVT::v64i1)
2981 RC = &X86::VK64RegClass;
2982 else
2983 llvm_unreachable("Unknown argument type!");
2985 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2986 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2987 }
2989 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2990 // bits. Insert an assert[sz]ext to capture this, then truncate to the
2991 // right size.
2992 if (VA.getLocInfo() == CCValAssign::SExt)
2993 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2994 DAG.getValueType(VA.getValVT()));
2995 else if (VA.getLocInfo() == CCValAssign::ZExt)
2996 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2997 DAG.getValueType(VA.getValVT()));
2998 else if (VA.getLocInfo() == CCValAssign::BCvt)
2999 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3001 if (VA.isExtInLoc()) {
3002 // Handle MMX values passed in XMM regs.
3003 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3004 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3005 else if (VA.getValVT().isVector() &&
3006 VA.getValVT().getScalarType() == MVT::i1 &&
3007 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3008 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3009 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3010 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3011 } else
3012 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3013 }
3014 } else {
3015 assert(VA.isMemLoc());
3016 ArgValue =
3017 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3018 }
3020 // If value is passed via pointer - do a load.
3021 if (VA.getLocInfo() == CCValAssign::Indirect)
3022 ArgValue =
3023 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3025 InVals.push_back(ArgValue);
3026 }
3028 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3029 // Swift calling convention does not require we copy the sret argument
3030 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3031 if (CallConv == CallingConv::Swift)
3032 continue;
3034 // All x86 ABIs require that for returning structs by value we copy the
3035 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3036 // the argument into a virtual register so that we can access it from the
3037 // return points.
3038 if (Ins[I].Flags.isSRet()) {
3039 unsigned Reg = FuncInfo->getSRetReturnReg();
3040 if (Reg == 0) {
3041 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3042 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3043 FuncInfo->setSRetReturnReg(Reg);
3044 }
3045 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3046 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3047 break;
3048 }
3049 }
3051 unsigned StackSize = CCInfo.getNextStackOffset();
3052 // Align stack specially for tail calls.
3053 if (shouldGuaranteeTCO(CallConv,
3054 MF.getTarget().Options.GuaranteedTailCallOpt))
3055 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3057 // If the function takes variable number of arguments, make a frame index for
3058 // the start of the first vararg value... for expansion of llvm.va_start. We
3059 // can skip this if there are no va_start calls.
3060 if (MFI.hasVAStart() &&
3061 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3062 CallConv != CallingConv::X86_ThisCall))) {
3063 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3064 }
3066 // Figure out if XMM registers are in use.
3067 assert(!(Subtarget.useSoftFloat() &&
3068 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3069 "SSE register cannot be used when SSE is disabled!");
3071 // 64-bit calling conventions support varargs and register parameters, so we
3072 // have to do extra work to spill them in the prologue.
3073 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3074 // Find the first unallocated argument registers.
3075 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3076 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3077 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3078 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3079 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3080 "SSE register cannot be used when SSE is disabled!");
3082 // Gather all the live in physical registers.
3083 SmallVector<SDValue, 6> LiveGPRs;
3084 SmallVector<SDValue, 8> LiveXMMRegs;
3085 SDValue ALVal;
3086 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3087 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3088 LiveGPRs.push_back(
3089 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3090 }
3091 if (!ArgXMMs.empty()) {
3092 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3093 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3094 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3095 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3096 LiveXMMRegs.push_back(
3097 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3098 }
3099 }
3101 if (IsWin64) {
3102 // Get to the caller-allocated home save location. Add 8 to account
3103 // for the return address.
3104 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3105 FuncInfo->setRegSaveFrameIndex(
3106 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3107 // Fixup to set vararg frame on shadow area (4 x i64).
3108 if (NumIntRegs < 4)
3109 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3110 } else {
3111 // For X86-64, if there are vararg parameters that are passed via
3112 // registers, then we must store them to their spots on the stack so
3113 // they may be loaded by dereferencing the result of va_next.
3114 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3115 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3116 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3117 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3118 }
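// On SysV this is the full register save area: 6 GPRs * 8 bytes plus
// 8 XMM registers * 16 bytes = 176 bytes, with FP values starting at
// offset 48, matching the va_list gp_offset/fp_offset bounds.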
3120 // Store the integer parameter registers.
3121 SmallVector<SDValue, 8> MemOps;
3122 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3123 getPointerTy(DAG.getDataLayout()));
3124 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3125 for (SDValue Val : LiveGPRs) {
3126 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3127 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3128 SDValue Store =
3129 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3130 MachinePointerInfo::getFixedStack(
3131 DAG.getMachineFunction(),
3132 FuncInfo->getRegSaveFrameIndex(), Offset));
3133 MemOps.push_back(Store);
3134 Offset += 8;
3135 }
3137 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3138 // Now store the XMM (fp + vector) parameter registers.
3139 SmallVector<SDValue, 12> SaveXMMOps;
3140 SaveXMMOps.push_back(Chain);
3141 SaveXMMOps.push_back(ALVal);
3142 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3143 FuncInfo->getRegSaveFrameIndex(), dl));
3144 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3145 FuncInfo->getVarArgsFPOffset(), dl));
3146 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3147 LiveXMMRegs.end());
3148 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3149 MVT::Other, SaveXMMOps));
3150 }
3152 if (!MemOps.empty())
3153 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3154 }
3156 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3157 // Find the largest legal vector type.
3158 MVT VecVT = MVT::Other;
3159 // FIXME: Only some x86_32 calling conventions support AVX512.
3160 if (Subtarget.hasAVX512() &&
3161 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3162 CallConv == CallingConv::Intel_OCL_BI)))
3163 VecVT = MVT::v16f32;
3164 else if (Subtarget.hasAVX())
3165 VecVT = MVT::v8f32;
3166 else if (Subtarget.hasSSE2())
3167 VecVT = MVT::v4f32;
3169 // We forward some GPRs and some vector types.
3170 SmallVector<MVT, 2> RegParmTypes;
3171 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3172 RegParmTypes.push_back(IntVT);
3173 if (VecVT != MVT::Other)
3174 RegParmTypes.push_back(VecVT);
3176 // Compute the set of forwarded registers. The rest are scratch.
3177 SmallVectorImpl<ForwardedRegister> &Forwards =
3178 FuncInfo->getForwardedMustTailRegParms();
3179 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3181 // Conservatively forward AL on x86_64, since it might be used for varargs.
3182 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3183 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3184 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3185 }
3187 // Copy all forwards from physical to virtual registers.
3188 for (ForwardedRegister &F : Forwards) {
3189 // FIXME: Can we use a less constrained schedule?
3190 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3191 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3192 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3193 }
3194 }
3196 // Some CCs need callee pop.
3197 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3198 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3199 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3200 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3201 // X86 interrupts must pop the error code (and the alignment padding) if
3202 // present.
3203 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3204 } else {
3205 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3206 // If this is an sret function, the return should pop the hidden pointer.
3207 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3208 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3209 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3210 FuncInfo->setBytesToPopOnReturn(4);
3211 }
3213 if (!Is64Bit) {
3214 // RegSaveFrameIndex is X86-64 only.
3215 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3216 if (CallConv == CallingConv::X86_FastCall ||
3217 CallConv == CallingConv::X86_ThisCall)
3218 // fastcc functions can't have varargs.
3219 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3220 }
3222 FuncInfo->setArgumentStackSize(StackSize);
3224 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3225 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3226 if (Personality == EHPersonality::CoreCLR) {
3227 assert(Is64Bit);
3228 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3229 // that we'd prefer this slot be allocated towards the bottom of the frame
3230 // (i.e. near the stack pointer after allocating the frame). Every
3231 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3232 // offset from the bottom of this and each funclet's frame must be the
3233 // same, so the size of funclets' (mostly empty) frames is dictated by
3234 // how far this slot is from the bottom (since they allocate just enough
3235 // space to accommodate holding this slot at the correct offset).
3236 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3237 EHInfo->PSPSymFrameIdx = PSPSymFI;
3238 }
3239 }
3241 if (CallConv == CallingConv::X86_RegCall ||
3242 Fn->hasFnAttribute("no_caller_saved_registers")) {
3243 const MachineRegisterInfo &MRI = MF.getRegInfo();
3244 for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3245 MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3246 }
3248 return Chain;
3249 }
3251 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3252 SDValue Arg, const SDLoc &dl,
3253 SelectionDAG &DAG,
3254 const CCValAssign &VA,
3255 ISD::ArgFlagsTy Flags) const {
3256 unsigned LocMemOffset = VA.getLocMemOffset();
3257 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3258 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3259 StackPtr, PtrOff);
3260 if (Flags.isByVal())
3261 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3263 return DAG.getStore(
3264 Chain, dl, Arg, PtrOff,
3265 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3266 }
3268 /// Emit a load of return address if tail call
3269 /// optimization is performed and it is required.
3270 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3271 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3272 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3273 // Adjust the Return address stack slot.
3274 EVT VT = getPointerTy(DAG.getDataLayout());
3275 OutRetAddr = getReturnAddressFrameIndex(DAG);
3277 // Load the "old" Return address.
3278 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3279 return SDValue(OutRetAddr.getNode(), 1);
3280 }
3282 /// Emit a store of the return address if tail call
3283 /// optimization is performed and it is required (FPDiff!=0).
3284 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3285 SDValue Chain, SDValue RetAddrFrIdx,
3286 EVT PtrVT, unsigned SlotSize,
3287 int FPDiff, const SDLoc &dl) {
3288 // Store the return address to the appropriate stack slot.
3289 if (!FPDiff) return Chain;
3290 // Calculate the new stack slot for the return address.
3291 int NewReturnAddrFI =
3292 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3293 false);
3294 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3295 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3296 MachinePointerInfo::getFixedStack(
3297 DAG.getMachineFunction(), NewReturnAddrFI));
3298 return Chain;
3299 }
3301 /// Returns a vector_shuffle mask for a movs{s|d}, movd
3302 /// operation of specified width.
3303 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3304 SDValue V2) {
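// For example, with VT = v4f32 the mask built below is <4, 1, 2, 3>: element 0
// comes from V2 and elements 1-3 from V1, which is exactly the movss pattern.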
3305 unsigned NumElems = VT.getVectorNumElements();
3306 SmallVector<int, 8> Mask;
3307 Mask.push_back(NumElems);
3308 for (unsigned i = 1; i != NumElems; ++i)
3309 Mask.push_back(i);
3310 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3311 }
3313 SDValue
3314 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3315 SmallVectorImpl<SDValue> &InVals) const {
3316 SelectionDAG &DAG = CLI.DAG;
3317 SDLoc &dl = CLI.DL;
3318 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3319 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3320 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3321 SDValue Chain = CLI.Chain;
3322 SDValue Callee = CLI.Callee;
3323 CallingConv::ID CallConv = CLI.CallConv;
3324 bool &isTailCall = CLI.IsTailCall;
3325 bool isVarArg = CLI.IsVarArg;
3327 MachineFunction &MF = DAG.getMachineFunction();
3328 bool Is64Bit = Subtarget.is64Bit();
3329 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3330 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3331 bool IsSibcall = false;
3332 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3333 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3334 const CallInst *CI =
3335 CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3336 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3337 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3338 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3340 if (CallConv == CallingConv::X86_INTR)
3341 report_fatal_error("X86 interrupts may not be called directly");
3343 if (Attr.getValueAsString() == "true")
3344 isTailCall = false;
3346 if (Subtarget.isPICStyleGOT() &&
3347 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3348 // If we are using a GOT, disable tail calls to external symbols with
3349 // default visibility. Tail calling such a symbol requires using a GOT
3350 // relocation, which forces early binding of the symbol. This breaks code
3351 // that requires lazy function symbol resolution. Using musttail or
3352 // GuaranteedTailCallOpt will override this.
3353 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3354 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3355 G->getGlobal()->hasDefaultVisibility())
3356 isTailCall = false;
3357 }
3359 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3360 if (IsMustTail) {
3361 // Force this to be a tail call. The verifier rules are enough to ensure
3362 // that we can lower this successfully without moving the return address
3363 // around.
3364 isTailCall = true;
3365 } else if (isTailCall) {
3366 // Check if it's really possible to do a tail call.
3367 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3368 isVarArg, SR != NotStructReturn,
3369 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3370 Outs, OutVals, Ins, DAG);
3372 // Sibcalls are automatically detected tailcalls which do not require
3373 // ABI changes.
3374 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3375 IsSibcall = true;
3377 if (isTailCall)
3378 ++NumTailCalls;
3379 }
3381 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3382 "Var args not supported with calling convention fastcc, ghc or hipe");
3384 // Analyze operands of the call, assigning locations to each operand.
3385 SmallVector<CCValAssign, 16> ArgLocs;
3386 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3388 // Allocate shadow area for Win64.
3389 if (IsWin64)
3390 CCInfo.AllocateStack(32, 8);
3392 CCInfo.AnalyzeArguments(Outs, CC_X86);
3394 // In vectorcall calling convention a second pass is required for the HVA
3395 // registers.
3396 if (CallingConv::X86_VectorCall == CallConv) {
3397 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3398 }
3400 // Get a count of how many bytes are to be pushed on the stack.
3401 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3402 if (IsSibcall)
3403 // This is a sibcall. The memory operands are available in the caller's
3404 // own caller's stack.
3405 NumBytes = 0;
3406 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3407 canGuaranteeTCO(CallConv))
3408 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3410 int FPDiff = 0;
3411 if (isTailCall && !IsSibcall && !IsMustTail) {
3412 // Lower arguments at fp - stackoffset + fpdiff.
3413 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3415 FPDiff = NumBytesCallerPushed - NumBytes;
3417 // Set the delta of movement of the returnaddr stackslot.
3418 // But only set if delta is greater than previous delta.
3419 if (FPDiff < X86Info->getTCReturnAddrDelta())
3420 X86Info->setTCReturnAddrDelta(FPDiff);
3421 }
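// For example, if the caller pushed 16 bytes of arguments but this call needs
// 24, FPDiff is -8 and the return address slot moves down by 8 bytes.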
3423 unsigned NumBytesToPush = NumBytes;
3424 unsigned NumBytesToPop = NumBytes;
3426 // If we have an inalloca argument, all stack space has already been allocated
3427 // for us and is right at the top of the stack. We don't support multiple
3428 // arguments passed in memory when using inalloca.
3429 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3430 NumBytesToPush = 0;
3431 if (!ArgLocs.back().isMemLoc())
3432 report_fatal_error("cannot use inalloca attribute on a register "
3433 "parameter");
3434 if (ArgLocs.back().getLocMemOffset() != 0)
3435 report_fatal_error("any parameter with the inalloca attribute must be "
3436 "the only memory argument");
3437 }
3439 if (!IsSibcall)
3440 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3441 NumBytes - NumBytesToPush, dl);
3443 SDValue RetAddrFrIdx;
3444 // Load return address for tail calls.
3445 if (isTailCall && FPDiff)
3446 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3447 Is64Bit, FPDiff, dl);
3449 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3450 SmallVector<SDValue, 8> MemOpChains;
3451 SDValue StackPtr;
3453 // The next loop assumes that the locations are in the same order as the
3454 // input arguments.
3455 assert(isSortedByValueNo(ArgLocs) &&
3456 "Argument Location list must be sorted before lowering");
3458 // Walk the register/memloc assignments, inserting copies/loads. In the case
3459 // of tail call optimization arguments are handled later.
3460 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3461 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3462 ++I, ++OutIndex) {
3463 assert(OutIndex < Outs.size() && "Invalid Out index");
3464 // Skip inalloca arguments, they have already been written.
3465 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3466 if (Flags.isInAlloca())
3467 continue;
3469 CCValAssign &VA = ArgLocs[I];
3470 EVT RegVT = VA.getLocVT();
3471 SDValue Arg = OutVals[OutIndex];
3472 bool isByVal = Flags.isByVal();
3474 // Promote the value if needed.
3475 switch (VA.getLocInfo()) {
3476 default: llvm_unreachable("Unknown loc info!");
3477 case CCValAssign::Full: break;
3478 case CCValAssign::SExt:
3479 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3480 break;
3481 case CCValAssign::ZExt:
3482 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3483 break;
3484 case CCValAssign::AExt:
3485 if (Arg.getValueType().isVector() &&
3486 Arg.getValueType().getVectorElementType() == MVT::i1)
3487 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3488 else if (RegVT.is128BitVector()) {
3489 // Special case: passing MMX values in XMM registers.
3490 Arg = DAG.getBitcast(MVT::i64, Arg);
3491 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3492 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3493 } else
3494 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3495 break;
3496 case CCValAssign::BCvt:
3497 Arg = DAG.getBitcast(RegVT, Arg);
3498 break;
3499 case CCValAssign::Indirect: {
3500 // Store the argument.
3501 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3502 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3503 Chain = DAG.getStore(
3504 Chain, dl, Arg, SpillSlot,
3505 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3506 Arg = SpillSlot;
3507 break;
3508 }
3509 }
3511 if (VA.needsCustom()) {
3512 assert(VA.getValVT() == MVT::v64i1 &&
3513 "Currently the only custom case is when we split v64i1 to 2 regs");
3514 // Split v64i1 value into two registers
3515 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3516 Subtarget);
3517 } else if (VA.isRegLoc()) {
3518 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3519 if (isVarArg && IsWin64) {
3520 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3521 // shadow reg if callee is a varargs function.
3522 unsigned ShadowReg = 0;
3523 switch (VA.getLocReg()) {
3524 case X86::XMM0: ShadowReg = X86::RCX; break;
3525 case X86::XMM1: ShadowReg = X86::RDX; break;
3526 case X86::XMM2: ShadowReg = X86::R8; break;
3527 case X86::XMM3: ShadowReg = X86::R9; break;
3528 }
3529 if (ShadowReg)
3530 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3531 }
3532 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3533 assert(VA.isMemLoc());
3534 if (!StackPtr.getNode())
3535 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3536 getPointerTy(DAG.getDataLayout()));
3537 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3538 dl, DAG, VA, Flags));
3539 }
3540 }
3542 if (!MemOpChains.empty())
3543 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3545 if (Subtarget.isPICStyleGOT()) {
3546 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3547 // GOT pointer.
3548 if (!isTailCall) {
3549 RegsToPass.push_back(std::make_pair(
3550 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3551 getPointerTy(DAG.getDataLayout()))));
3552 } else {
3553 // If we are tail calling and generating PIC/GOT style code load the
3554 // address of the callee into ECX. The value in ecx is used as target of
3555 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3556 // for tail calls on PIC/GOT architectures. Normally we would just put the
3557 // address of GOT into ebx and then call target@PLT. But for tail calls
3558 // ebx would be restored (since ebx is callee saved) before jumping to the
3559 // target.
3560 //
3561 // Note: The actual moving to ECX is done further down.
3562 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3563 if (G && !G->getGlobal()->hasLocalLinkage() &&
3564 G->getGlobal()->hasDefaultVisibility())
3565 Callee = LowerGlobalAddress(Callee, DAG);
3566 else if (isa<ExternalSymbolSDNode>(Callee))
3567 Callee = LowerExternalSymbol(Callee, DAG);
3568 }
3569 }
3571 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3572 // From AMD64 ABI document:
3573 // For calls that may call functions that use varargs or stdargs
3574 // (prototype-less calls or calls to functions containing ellipsis (...) in
3575 // the declaration) %al is used as a hidden argument to specify the number
3576 // of SSE registers used. The contents of %al do not need to match exactly
3577 // the number of registers, but must be an upper bound on the number of SSE
3578 // registers used, in the range 0 - 8 inclusive.
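// For example, a prototype-less call passing one double in XMM0 may set %al
// to any value from 1 to 8; the exact count is simply the cheapest valid
// choice.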
3580 // Count the number of XMM registers allocated.
3581 static const MCPhysReg XMMArgRegs[] = {
3582 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3583 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3584 };
3585 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3586 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3587 && "SSE registers cannot be used when SSE is disabled");
3589 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3590 DAG.getConstant(NumXMMRegs, dl,
3591 MVT::i8)));
3592 }
3594 if (isVarArg && IsMustTail) {
3595 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3596 for (const auto &F : Forwards) {
3597 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3598 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3599 }
3600 }
3602 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3603 // don't need this because the eligibility check rejects calls that require
3604 // shuffling arguments passed in memory.
3605 if (!IsSibcall && isTailCall) {
3606 // Force all the incoming stack arguments to be loaded from the stack
3607 // before any new outgoing arguments are stored to the stack, because the
3608 // outgoing stack slots may alias the incoming argument stack slots, and
3609 // the alias isn't otherwise explicit. This is slightly more conservative
3610 // than necessary, because it means that each store effectively depends
3611 // on every argument instead of just those arguments it would clobber.
3612 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3614 SmallVector<SDValue, 8> MemOpChains2;
3615 SDValue FIN;
3616 int FI = 0;
3617 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3618 ++I, ++OutsIndex) {
3619 CCValAssign &VA = ArgLocs[I];
3621 if (VA.isRegLoc()) {
3622 if (VA.needsCustom()) {
3623 assert((CallConv == CallingConv::X86_RegCall) &&
3624 "Expecting custom case only in regcall calling convention");
3625 // This means that we are in special case where one argument was
3626 // passed through two register locations - skip the next location.
3627 ++I;
3628 }
3630 continue;
3631 }
3633 assert(VA.isMemLoc());
3634 SDValue Arg = OutVals[OutsIndex];
3635 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3636 // Skip inalloca arguments. They don't require any work.
3637 if (Flags.isInAlloca())
3638 continue;
3639 // Create frame index.
3640 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3641 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3642 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3643 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3645 if (Flags.isByVal()) {
3646 // Copy relative to framepointer.
3647 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3648 if (!StackPtr.getNode())
3649 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3650 getPointerTy(DAG.getDataLayout()));
3651 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3652 StackPtr, Source);
3654 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3655 ArgChain,
3656 Flags, DAG, dl));
3657 } else {
3658 // Store relative to framepointer.
3659 MemOpChains2.push_back(DAG.getStore(
3660 ArgChain, dl, Arg, FIN,
3661 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3662 }
3663 }
3665 if (!MemOpChains2.empty())
3666 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3668 // Store the return address to the appropriate stack slot.
3669 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3670 getPointerTy(DAG.getDataLayout()),
3671 RegInfo->getSlotSize(), FPDiff, dl);
3672 }
3674 // Build a sequence of copy-to-reg nodes chained together with token chain
3675 // and flag operands which copy the outgoing args into registers.
3676 SDValue InFlag;
3677 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3678 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3679 RegsToPass[i].second, InFlag);
3680 InFlag = Chain.getValue(1);
3681 }
3683 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3684 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3685 // In the 64-bit large code model, we have to make all calls
3686 // through a register, since the call instruction's 32-bit
3687 // pc-relative offset may not be large enough to hold the whole
3688 // address.
3689 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3690 // If the callee is a GlobalAddress node (quite common, every direct call
3691 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3692 // it.
3693 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3695 // We should use extra load for direct calls to dllimported functions in
3696 // non-JIT mode.
3697 const GlobalValue *GV = G->getGlobal();
3698 if (!GV->hasDLLImportStorageClass()) {
3699 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3701 Callee = DAG.getTargetGlobalAddress(
3702 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3704 if (OpFlags == X86II::MO_GOTPCREL) {
3705 // Add a wrapper.
3706 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3707 getPointerTy(DAG.getDataLayout()), Callee);
3708 // Add extra indirection
3709 Callee = DAG.getLoad(
3710 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3711 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3712 }
3713 }
3714 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3715 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3716 unsigned char OpFlags =
3717 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3719 Callee = DAG.getTargetExternalSymbol(
3720 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3721 } else if (Subtarget.isTarget64BitILP32() &&
3722 Callee->getValueType(0) == MVT::i32) {
3723 // Zero-extend the 32-bit Callee address into a 64-bit one, per the x32 ABI.
3724 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3725 }
3727 // Returns a chain & a flag for retval copy to use.
3728 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3729 SmallVector<SDValue, 8> Ops;
3731 if (!IsSibcall && isTailCall) {
3732 Chain = DAG.getCALLSEQ_END(Chain,
3733 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3734 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3735 InFlag = Chain.getValue(1);
3736 }
3738 Ops.push_back(Chain);
3739 Ops.push_back(Callee);
3741 if (isTailCall)
3742 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3744 // Add argument registers to the end of the list so that they are known live
3745 // into the call.
3746 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3747 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3748 RegsToPass[i].second.getValueType()));
3750 // Add a register mask operand representing the call-preserved registers.
3751 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3752 // set X86_INTR calling convention because it has the same CSR mask
3753 // (same preserved registers).
3754 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3755 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3756 assert(Mask && "Missing call preserved mask for calling convention");
3758 // If this is an invoke in a 32-bit function using a funclet-based
3759 // personality, assume the function clobbers all registers. If an exception
3760 // is thrown, the runtime will not restore CSRs.
3761 // FIXME: Model this more precisely so that we can register allocate across
3762 // the normal edge and spill and fill across the exceptional edge.
3763 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3764 const Function *CallerFn = MF.getFunction();
3765 EHPersonality Pers =
3766 CallerFn->hasPersonalityFn()
3767 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3768 : EHPersonality::Unknown;
3769 if (isFuncletEHPersonality(Pers))
3770 Mask = RegInfo->getNoPreservedMask();
3771 }
3773 // Define a new register mask from the existing mask.
3774 uint32_t *RegMask = nullptr;
3776 // In some calling conventions we need to remove the used physical registers
3777 // from the reg mask.
3778 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3779 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3781 // Allocate a new Reg Mask and copy Mask.
3782 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3783 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3784 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3786 // Make sure all sub registers of the argument registers are reset
3787 // in the RegMask.
3788 for (auto const &RegPair : RegsToPass)
3789 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3790 SubRegs.isValid(); ++SubRegs)
3791 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3793 // Create the RegMask Operand according to our updated mask.
3794 Ops.push_back(DAG.getRegisterMask(RegMask));
3795 } else {
3796 // Create the RegMask Operand according to the static mask.
3797 Ops.push_back(DAG.getRegisterMask(Mask));
3798 }
3800 if (InFlag.getNode())
3801 Ops.push_back(InFlag);
3803 if (isTailCall) {
3805 //// If this is the first return lowered for this function, add the regs
3806 //// to the liveout set for the function.
3807 // This isn't right, although it's probably harmless on x86; liveouts
3808 // should be computed from returns not tail calls. Consider a void
3809 // function making a tail call to a function returning int.
3810 MF.getFrameInfo().setHasTailCall();
3811 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3812 }
3814 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3815 InFlag = Chain.getValue(1);
3817 // Create the CALLSEQ_END node.
3818 unsigned NumBytesForCalleeToPop;
3819 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3820 DAG.getTarget().Options.GuaranteedTailCallOpt))
3821 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3822 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3823 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3824 SR == StackStructReturn)
3825 // If this is a call to a struct-return function, the callee
3826 // pops the hidden struct pointer, so we have to push it back.
3827 // This is common for Darwin/X86, Linux & Mingw32 targets.
3828 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3829 NumBytesForCalleeToPop = 4;
3830 else
3831 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3833 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3834 // No need to reset the stack after the call if the call doesn't return. To
3835 // keep the MachineInstr verifier happy, we'll pretend the callee does it for us.
3836 NumBytesForCalleeToPop = NumBytes;
3837 }
3839 // Returns a flag for retval copy to use.
3840 if (!IsSibcall) {
3841 Chain = DAG.getCALLSEQ_END(Chain,
3842 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3843 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3844 true),
3845 InFlag, dl);
3846 InFlag = Chain.getValue(1);
3847 }
3849 // Handle result values, copying them out of physregs into vregs that we
3850 // return.
3851 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3852 InVals, RegMask);
3853 }
3855 //===----------------------------------------------------------------------===//
3856 // Fast Calling Convention (tail call) implementation
3857 //===----------------------------------------------------------------------===//
3859 // Like std call except that the callee cleans the arguments and ECX is
3860 // reserved for storing the tail-called function address. Only 2 registers are
3861 // free for argument passing (inreg). Tail call optimization is performed
3862 // provided:
3863 // * tailcallopt is enabled
3864 // * caller/callee are fastcc
3865 // On the X86_64 architecture, with GOT-style position independent code, only
3866 // local (within module) calls are supported at the moment.
3867 // To keep the stack aligned according to the platform ABI, the function
3868 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3869 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3870 // If a tail-called callee has more arguments than the caller, the caller
3871 // needs to make sure that there is room to move the RETADDR to. This is
3872 // achieved by reserving an area the size of the argument delta right after the
3873 // original RETADDR, but before the saved framepointer or the spilled registers
3874 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
3875 // stack layout:
3876 //    arg1
3877 //    arg2
3878 //    RETADDR
3879 //    [ new RETADDR
3880 //      move area ]
3881 //    (possible EBP)
3882 //    ESI
3883 //    EDI
3884 //    local1 ..
3886 /// Make the stack size aligned, e.g. 16n + 12, for a 16-byte alignment
3887 /// requirement.
3888 unsigned
3889 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3890 SelectionDAG& DAG) const {
3891 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3892 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3893 unsigned StackAlignment = TFI.getStackAlignment();
3894 uint64_t AlignMask = StackAlignment - 1;
3895 int64_t Offset = StackSize;
3896 unsigned SlotSize = RegInfo->getSlotSize();
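// For example, with StackAlignment = 16 and SlotSize = 4: StackSize 20 has
// (20 & 15) = 4 <= 12, so it becomes 20 + (12 - 4) = 28 = 16 + 12, while
// StackSize 30 has (30 & 15) = 14 > 12 and becomes 16 + 16 + 12 = 44.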
3897 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3898 // Number smaller than 12 so just add the difference.
3899 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3900 } else {
3901 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3902 Offset = ((~AlignMask) & Offset) + StackAlignment +
3903 (StackAlignment-SlotSize);
3904 }
3905 return Offset;
3906 }
3908 /// Return true if the given stack call argument is already available in the
3909 /// same position (relatively) of the caller's incoming argument stack.
3910 static
3911 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3912 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3913 const X86InstrInfo *TII, const CCValAssign &VA) {
3914 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3917 // Look through nodes that don't alter the bits of the incoming value.
3918 unsigned Op = Arg.getOpcode();
3919 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3920 Arg = Arg.getOperand(0);
3921 Op = Arg.getOpcode();
3922 }
3923 if (Op == ISD::TRUNCATE) {
3924 const SDValue &TruncInput = Arg.getOperand(0);
3925 if (TruncInput.getOpcode() == ISD::AssertZext &&
3926 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3927 Arg.getValueType()) {
3928 Arg = TruncInput.getOperand(0);
3929 Op = Arg.getOpcode();
3930 }
3931 }
3933 int FI = INT_MAX;
3936 if (Arg.getOpcode() == ISD::CopyFromReg) {
3937 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3938 if (!TargetRegisterInfo::isVirtualRegister(VR))
3939 return false;
3940 MachineInstr *Def = MRI->getVRegDef(VR);
3941 if (!Def)
3942 return false;
3943 if (!Flags.isByVal()) {
3944 if (!TII->isLoadFromStackSlot(*Def, FI))
3945 return false;
3946 } else {
3947 unsigned Opcode = Def->getOpcode();
3948 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3949 Opcode == X86::LEA64_32r) &&
3950 Def->getOperand(1).isFI()) {
3951 FI = Def->getOperand(1).getIndex();
3952 Bytes = Flags.getByValSize();
3953 } else
3954 return false;
3955 }
3956 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3957 if (Flags.isByVal())
3958 // ByVal argument is passed in as a pointer but it's now being
3959 // dereferenced. e.g.
3960 // define @foo(%struct.X* %A) {
3961 // tail call @bar(%struct.X* byval %A)
3962 // }
3963 return false;
3964 SDValue Ptr = Ld->getBasePtr();
3965 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3966 if (!FINode)
3967 return false;
3968 FI = FINode->getIndex();
3969 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3970 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3971 FI = FINode->getIndex();
3972 Bytes = Flags.getByValSize();
3973 } else
3974 return false;
3976 assert(FI != INT_MAX);
3977 if (!MFI.isFixedObjectIndex(FI))
3978 return false;
3980 if (Offset != MFI.getObjectOffset(FI))
3981 return false;
3983 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3984 // If the argument location is wider than the argument type, check that any
3985 // extension flags match.
3986 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3987 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3988 return false;
3989 }
3990 }
3992 return Bytes == MFI.getObjectSize(FI);
3993 }
3995 /// Check whether the call is eligible for tail call optimization. Targets
3996 /// that want to do tail call optimization should implement this function.
3997 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3998 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3999 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4000 const SmallVectorImpl<ISD::OutputArg> &Outs,
4001 const SmallVectorImpl<SDValue> &OutVals,
4002 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4003 if (!mayTailCallThisCC(CalleeCC))
4004 return false;
4006 // If -tailcallopt is specified, make fastcc functions tail-callable.
4007 MachineFunction &MF = DAG.getMachineFunction();
4008 const Function *CallerF = MF.getFunction();
4010 // If the function return type is x86_fp80 and the callee return type is not,
4011 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4012 // perform a tailcall optimization here.
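// For example, a caller returning long double (x86_fp80) that tail-calls a
// callee returning double would need an FP_EXTEND of the result after the
// call, so the check below rejects that case.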
4013 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4014 return false;
4016 CallingConv::ID CallerCC = CallerF->getCallingConv();
4017 bool CCMatch = CallerCC == CalleeCC;
4018 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4019 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4021 // Win64 functions have extra shadow space for argument homing. Don't do the
4022 // sibcall if the caller and callee have mismatched expectations for this
4023 // space.
4024 if (IsCalleeWin64 != IsCallerWin64)
4025 return false;
4027 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4028 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4029 return true;
4030 return false;
4031 }
4033 // Look for obvious safe cases to perform tail call optimization that do not
4034 // require ABI changes. This is what gcc calls sibcall.
4036 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4037 // emit a special epilogue.
4038 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4039 if (RegInfo->needsStackRealignment(MF))
4040 return false;
4042 // Also avoid sibcall optimization if either caller or callee uses struct
4043 // return semantics.
4044 if (isCalleeStructRet || isCallerStructRet)
4045 return false;
4047 // Do not sibcall optimize vararg calls unless all arguments are passed via
4048 // registers.
4049 LLVMContext &C = *DAG.getContext();
4050 if (isVarArg && !Outs.empty()) {
4051 // Optimizing for varargs on Win64 is unlikely to be safe without
4052 // additional testing.
4053 if (IsCalleeWin64 || IsCallerWin64)
4054 return false;
4056 SmallVector<CCValAssign, 16> ArgLocs;
4057 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4059 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4060 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4061 if (!ArgLocs[i].isRegLoc())
4062 return false;
4063 }
4065 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4066 // stack. Therefore, if it's not used by the call it is not safe to optimize
4067 // this into a sibcall.
4068 bool Unused = false;
4069 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4070 if (!Ins[i].Used) {
4071 Unused = true;
4072 break;
4073 }
4074 }
4075 if (Unused) {
4076 SmallVector<CCValAssign, 16> RVLocs;
4077 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4078 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4079 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4080 CCValAssign &VA = RVLocs[i];
4081 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4082 return false;
4083 }
4084 }
4086 // Check that the call results are passed in the same way.
4087 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4088 RetCC_X86, RetCC_X86))
4089 return false;
4090 // The callee has to preserve all registers the caller needs to preserve.
4091 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4092 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4093 if (!CCMatch) {
4094 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4095 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4096 return false;
4097 }
4099 unsigned StackArgsSize = 0;
4101 // If the callee takes no arguments then go on to check the results of the
4102 // call.
4103 if (!Outs.empty()) {
4104 // Check if stack adjustment is needed. For now, do not do this if any
4105 // argument is passed on the stack.
4106 SmallVector<CCValAssign, 16> ArgLocs;
4107 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4109 // Allocate shadow area for Win64.
4110 if (IsCalleeWin64)
4111 CCInfo.AllocateStack(32, 8);
4113 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4114 StackArgsSize = CCInfo.getNextStackOffset();
4116 if (CCInfo.getNextStackOffset()) {
4117 // Check if the arguments are already laid out in the right way as
4118 // the caller's fixed stack objects.
4119 MachineFrameInfo &MFI = MF.getFrameInfo();
4120 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4121 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4122 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4123 CCValAssign &VA = ArgLocs[i];
4124 SDValue Arg = OutVals[i];
4125 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4126 if (VA.getLocInfo() == CCValAssign::Indirect)
4127 return false;
4128 if (!VA.isRegLoc()) {
4129 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4130 MFI, MRI, TII, VA))
4131 return false;
4132 }
4133 }
4134 }
4136 bool PositionIndependent = isPositionIndependent();
4137 // If the tailcall address may be in a register, then make sure it's
4138 // possible to register allocate for it. In 32-bit, the call address can
4139 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4140 // callee-saved registers are restored. These happen to be the same
4141 // registers used to pass 'inreg' arguments so watch out for those.
4142 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4143 !isa<ExternalSymbolSDNode>(Callee)) ||
4144 PositionIndependent)) {
4145 unsigned NumInRegs = 0;
4146 // In PIC we need an extra register to formulate the address computation
4147 // for the callee.
4148 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4150 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4151 CCValAssign &VA = ArgLocs[i];
4152 if (!VA.isRegLoc())
4153 continue;
4154 unsigned Reg = VA.getLocReg();
4155 switch (Reg) {
4156 default: break;
4157 case X86::EAX: case X86::EDX: case X86::ECX:
4158 if (++NumInRegs == MaxInRegs)
4159 return false;
4160 break;
4161 }
4162 }
4163 }
4165 const MachineRegisterInfo &MRI = MF.getRegInfo();
4166 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4167 return false;
4168 }
4170 bool CalleeWillPop =
4171 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4172 MF.getTarget().Options.GuaranteedTailCallOpt);
4174 if (unsigned BytesToPop =
4175 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4176 // If we have bytes to pop, the callee must pop them.
4177 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4178 if (!CalleePopMatches)
4179 return false;
4180 } else if (CalleeWillPop && StackArgsSize > 0) {
4181 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4182 return false;
4183 }
4185 return true;
4186 }
4188 FastISel *
4189 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4190 const TargetLibraryInfo *libInfo) const {
4191 return X86::createFastISel(funcInfo, libInfo);
4192 }
4194 //===----------------------------------------------------------------------===//
4195 // Other Lowering Hooks
4196 //===----------------------------------------------------------------------===//
4198 static bool MayFoldLoad(SDValue Op) {
4199 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4200 }
4202 static bool MayFoldIntoStore(SDValue Op) {
4203 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4204 }
4206 static bool MayFoldIntoZeroExtend(SDValue Op) {
4207 if (Op.hasOneUse()) {
4208 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4209 return (ISD::ZERO_EXTEND == Opcode);
4210 }
4211 return false;
4212 }
4214 static bool isTargetShuffle(unsigned Opcode) {
4215 switch(Opcode) {
4216 default: return false;
4217 case X86ISD::BLENDI:
4218 case X86ISD::PSHUFB:
4219 case X86ISD::PSHUFD:
4220 case X86ISD::PSHUFHW:
4221 case X86ISD::PSHUFLW:
4222 case X86ISD::SHUFP:
4223 case X86ISD::INSERTPS:
4224 case X86ISD::PALIGNR:
4225 case X86ISD::VSHLDQ:
4226 case X86ISD::VSRLDQ:
4227 case X86ISD::MOVLHPS:
4228 case X86ISD::MOVLHPD:
4229 case X86ISD::MOVHLPS:
4230 case X86ISD::MOVLPS:
4231 case X86ISD::MOVLPD:
4232 case X86ISD::MOVSHDUP:
4233 case X86ISD::MOVSLDUP:
4234 case X86ISD::MOVDDUP:
4235 case X86ISD::MOVSS:
4236 case X86ISD::MOVSD:
4237 case X86ISD::UNPCKL:
4238 case X86ISD::UNPCKH:
4239 case X86ISD::VBROADCAST:
4240 case X86ISD::VPERMILPI:
4241 case X86ISD::VPERMILPV:
4242 case X86ISD::VPERM2X128:
4243 case X86ISD::VPERMIL2:
4244 case X86ISD::VPERMI:
4245 case X86ISD::VPPERM:
4246 case X86ISD::VPERMV:
4247 case X86ISD::VPERMV3:
4248 case X86ISD::VPERMIV3:
4249 case X86ISD::VZEXT_MOVL:
4250 return true;
4251 }
4252 }
4254 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4255 switch (Opcode) {
4256 default: return false;
4257 // Target Shuffles.
4258 case X86ISD::PSHUFB:
4259 case X86ISD::VPERMILPV:
4260 case X86ISD::VPERMIL2:
4261 case X86ISD::VPPERM:
4262 case X86ISD::VPERMV:
4263 case X86ISD::VPERMV3:
4264 case X86ISD::VPERMIV3:
4265 return true;
4266 // 'Faux' Target Shuffles.
4267 case ISD::OR:
4268 case ISD::AND:
4269 case X86ISD::ANDNP:
4270 return true;
4271 }
4272 }
4273 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4274 MachineFunction &MF = DAG.getMachineFunction();
4275 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4276 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4277 int ReturnAddrIndex = FuncInfo->getRAIndex();
4279 if (ReturnAddrIndex == 0) {
4280 // Set up a frame object for the return address.
4281 unsigned SlotSize = RegInfo->getSlotSize();
4282 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4283 -(int64_t)SlotSize,
4284 false);
4285 FuncInfo->setRAIndex(ReturnAddrIndex);
4286 }
4288 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4289 }
4291 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4292 bool hasSymbolicDisplacement) {
4293 // Offset should fit into 32 bit immediate field.
4294 if (!isInt<32>(Offset))
4295 return false;
4297 // If we don't have a symbolic displacement - we don't have any extra
4298 // restrictions.
4299 if (!hasSymbolicDisplacement)
4300 return true;
4302 // FIXME: Some tweaks might be needed for medium code model.
4303 if (M != CodeModel::Small && M != CodeModel::Kernel)
4304 return false;
4306 // For the small code model we assume that the last object is 16MB before
4307 // the end of the 31-bit boundary. We may also accept pretty large negative
4308 // constants, knowing that all objects are in the positive half of the address space.
4309 if (M == CodeModel::Small && Offset < 16*1024*1024)
4310 return true;
4312 // For the kernel code model we know that all objects reside in the negative
4313 // half of the 32-bit address space. We may not accept negative offsets, since
4314 // they may be just off and we may accept pretty large positive ones.
4315 if (M == CodeModel::Kernel && Offset >= 0)
4316 return true;
4318 return false;
4319 }
4321 /// Determines whether the callee is required to pop its own arguments.
4322 /// Callee pop is necessary to support tail calls.
4323 bool X86::isCalleePop(CallingConv::ID CallingConv,
4324 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4325 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4326 // can guarantee TCO.
4327 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4328 return true;
4330 switch (CallingConv) {
4331 default:
4332 return false;
4333 case CallingConv::X86_StdCall:
4334 case CallingConv::X86_FastCall:
4335 case CallingConv::X86_ThisCall:
4336 case CallingConv::X86_VectorCall:
4337 return true;
4338 }
4339 }
4341 /// \brief Return true if the condition is an unsigned comparison operation.
4342 static bool isX86CCUnsigned(unsigned X86CC) {
4343 switch (X86CC) {
4344 default:
4345 llvm_unreachable("Invalid integer condition!");
4346 case X86::COND_E:
4347 case X86::COND_NE:
4348 case X86::COND_B:
4349 case X86::COND_A:
4350 case X86::COND_BE:
4351 case X86::COND_AE:
4352 return true;
4353 case X86::COND_G:
4354 case X86::COND_GE:
4355 case X86::COND_L:
4356 case X86::COND_LE:
4357 return false;
4358 }
4359 }
4361 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4362 switch (SetCCOpcode) {
4363 default: llvm_unreachable("Invalid integer condition!");
4364 case ISD::SETEQ: return X86::COND_E;
4365 case ISD::SETGT: return X86::COND_G;
4366 case ISD::SETGE: return X86::COND_GE;
4367 case ISD::SETLT: return X86::COND_L;
4368 case ISD::SETLE: return X86::COND_LE;
4369 case ISD::SETNE: return X86::COND_NE;
4370 case ISD::SETULT: return X86::COND_B;
4371 case ISD::SETUGT: return X86::COND_A;
4372 case ISD::SETULE: return X86::COND_BE;
4373 case ISD::SETUGE: return X86::COND_AE;
4374 }
4375 }
4377 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4378 /// condition code, returning the condition code and the LHS/RHS of the
4379 /// comparison to make.
4380 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4381 bool isFP, SDValue &LHS, SDValue &RHS,
4382 SelectionDAG &DAG) {
4383 if (!isFP) {
4384 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4385 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4386 // X > -1 -> X == 0, jump !sign.
4387 RHS = DAG.getConstant(0, DL, RHS.getValueType());
      return X86::COND_NS;
    }
4390 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
      // X < 0 -> X == 0, jump on sign.
      return X86::COND_S;
    }
    if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
      // X < 1 -> X <= 0
4396 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4397 return X86::COND_LE;
      }
    }

    return TranslateIntegerX86CC(SetCCOpcode);
  }
4404 // First determine if it is required or is profitable to flip the operands.
4406 // If LHS is a foldable load, but RHS is not, flip the condition.
4407 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4408 !ISD::isNON_EXTLoad(RHS.getNode())) {
4409 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }
  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
4429 switch (SetCCOpcode) {
4430 default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:  // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:  // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:  // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:  // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}
4454 /// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}
4474 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           unsigned Intrinsic) const {
  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
  if (!IntrData)
    return false;

4482 Info.opc = ISD::INTRINSIC_W_CHAIN;
4483 Info.readMem = false;
4484 Info.writeMem = false;
  switch (IntrData->Type) {
  case EXPAND_FROM_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getType());
    Info.align = 1;
    Info.readMem = true;
    break;
  }
  case COMPRESS_TO_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    Info.ptrVal = I.getArgOperand(0);
    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
      ScalarVT = MVT::i8;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;

    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  default:
    return false;
  }

  return true;
}
4528 /// Returns true if the target can instruction select the
4529 /// specified FP immediate natively. If false, the legalizer will
4530 /// materialize the FP immediate as a load from a constant pool.
4531 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}
4539 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                               ISD::LoadExtType ExtTy,
                                               EVT NewVT) const {
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocations target a movq or addq instruction: don't let the load shrink.
4544 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4545 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4546 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
  return true;
}
4551 /// \brief Returns true if it is beneficial to convert a load of a constant
4552 /// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}
4563 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4564 unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}
4571 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4572 // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}
4576 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4577 // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}
4581 bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}
4585 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}
4590 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  EVT VT = Y.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return true;
}
4602 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
4622 /// Val is the undef sentinel value or equal to the specified value.
4623 static bool isUndefOrEqual(int Val, int CmpVal) {
  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}
4627 /// Val is either the undef or zero sentinel value.
4628 static bool isUndefOrZero(int Val) {
  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
4632 /// Return true if every element in Mask, beginning
4633 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4634 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4635 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
      return false;
  return true;
}
/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
}
/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}
/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
}
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
      return false;
  return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}
4694 /// Return true if every element in Mask, beginning
4695 /// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (!isUndefOrZero(Mask[i]))
      return false;
  return true;
}
4704 /// \brief Helper function to test whether a shuffle mask could be
4705 /// simplified by widening the elements being shuffled.
4707 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4708 /// leaves it in an unspecified state.
4710 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4711 /// shuffle masks. The latter have the special property of a '-2' representing
4712 /// a zero-ed lane of a vector.
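///
/// For example (a sketch, not from the original comments): the v8i16 mask
/// {0, 1, 6, 7, -1, -1, 4, 5} widens to the v4i32 mask {0, 3, -1, 2}, while
/// a mask pairing non-adjacent elements such as {0, 2, ...} cannot be
/// widened.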
4713 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4714 SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, it's trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair boundary.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
4763 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4764 /// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can
/// always succeed.
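///
/// For example (illustrative): scaling the v4i32 mask {0, 2, -1, 3} by 2
/// produces the v8i16 mask {0, 1, 4, 5, -1, -1, 6, 7}.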
4767 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4768 SmallVectorImpl<int> &ScaledMask) {
4769 assert(0 < Scale && "Unexpected scaling factor");
4770 int NumElts = Mask.size();
4771 ScaledMask.assign(NumElts * Scale, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];

    // Repeat sentinel values in every mask element.
    if (M < 0) {
      for (int s = 0; s != Scale; ++s)
        ScaledMask[(Scale * i) + s] = M;
      continue;
    }

    // Scale mask element and increment across each mask element.
    for (int s = 0; s != Scale; ++s)
      ScaledMask[(Scale * i) + s] = (Scale * M) + s;
  }
}
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128- or 256-bit
/// vectors.
4791 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4792 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
    return false;

4796 // The index should be aligned on a vecWidth-bit boundary.
4797 uint64_t Index = N->getConstantOperandVal(1);
4798 MVT VT = N->getSimpleValueType(0);
4799 unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}
/// Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
/// insertion of 128- or 256-bit subvectors.
4806 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4807 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    return false;

4811 // The index should be aligned on a vecWidth-bit boundary.
4812 uint64_t Index = N->getConstantOperandVal(2);
4813 MVT VT = N->getSimpleValueType(0);
4814 unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}
bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}

bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}

bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}

bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}
4834 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4835 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4836 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4837 "Illegal extract subvector for VEXTRACT");
4839 uint64_t Index = N->getConstantOperandVal(1);
4840 MVT VecVT = N->getOperand(0).getSimpleValueType();
4841 unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}
4845 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4846 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4847 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4848 "Illegal insert subvector for VINSERT");
4850 uint64_t Index = N->getConstantOperandVal(2);
4851 MVT VecVT = N->getSimpleValueType(0);
4852 unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
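/// For example (illustrative): for a v8i32 source, an extract starting at
/// element index 6 lies in the upper 128-bit chunk, giving immediate
/// 6 / 4 = 1.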
4858 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 128);
}
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4864 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 256);
}
4868 /// Return the appropriate immediate to insert at the specified
4869 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4870 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 128);
}
/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4876 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 256);
}
4880 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4881 bool X86::isZeroNode(SDValue Elt) {
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants into 32-bit pieces in 32-bit mode.
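// For example (illustrative): in 32-bit mode a v2i64 mask constant {3, 1} is
// built as the v4i32 vector {3, 0, 1, 0} and then bitcast back to v2i64.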
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
                               DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                              DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
4959 /// Returns a vector of specified type with all zero elements.
4960 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4961 SelectionDAG &DAG, const SDLoc &dl) {
4962 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4963 VT.getVectorElementType() == MVT::i1) &&
4964 "Unexpected vector type");
  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.getVectorElementType() == MVT::i1) {
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}
4985 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4986 const SDLoc &dl, unsigned vectorWidth) {
4987 EVT VT = Vec.getValueType();
4988 EVT ElVT = VT.getVectorElementType();
4989 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4990 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4991 VT.getVectorNumElements()/Factor);
4993 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4994 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4995 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4997 // This is the index of the first element of the vectorWidth-bit chunk
4998 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4999 IdxVal &= ~(ElemsPerChunk - 1);
5001 // If the input is a buildvector just emit a smaller one.
5002 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5003 return DAG.getBuildVector(
5004 ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
5006 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
5010 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5011 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5012 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5013 /// instructions or a simple subregister reference. Idx is an index in the
5014 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5015 /// lowering EXTRACT_VECTOR_ELT operations easier.
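/// For example (illustrative): asking for "the 128 bits at element 5" of a
/// v8i32 rounds the index down to the aligned element 4 and extracts the
/// upper 128-bit half.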
5016 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5017 SelectionDAG &DAG, const SDLoc &dl) {
5018 assert((Vec.getValueType().is256BitVector() ||
5019 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
5023 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5024 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5025 SelectionDAG &DAG, const SDLoc &dl) {
5026 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
5030 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5031 SelectionDAG &DAG, const SDLoc &dl,
5032 unsigned vectorWidth) {
5033 assert((vectorWidth == 128 || vectorWidth == 256) &&
5034 "Unsupported vector width");
  // Inserting an UNDEF subvector changes nothing: just return Result.
  if (Vec.isUndef())
    return Result;

5038 EVT VT = Vec.getValueType();
5039 EVT ElVT = VT.getVectorElementType();
5040 EVT ResultVT = Result.getValueType();
5042 // Insert the relevant vectorWidth bits.
5043 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5044 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5046 // This is the index of the first element of the vectorWidth-bit chunk
5047 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5048 IdxVal &= ~(ElemsPerChunk - 1);
5050 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
5054 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5055 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5056 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5057 /// simple superregister reference. Idx is an index in the 128 bits
5058 /// we want. It need not be aligned to a 128-bit boundary. That makes
5059 /// lowering INSERT_VECTOR_ELT operations easier.
5060 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5061 SelectionDAG &DAG, const SDLoc &dl) {
5062 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
5066 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5067 SelectionDAG &DAG, const SDLoc &dl) {
5068 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
5072 /// Insert i1-subvector to i1-vector.
5073 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
5077 SDValue Vec = Op.getOperand(0);
5078 SDValue SubVec = Op.getOperand(1);
5079 SDValue Idx = Op.getOperand(2);
  if (!isa<ConstantSDNode>(Idx))
    return SDValue();

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;

5088 MVT OpVT = Op.getSimpleValueType();
5089 MVT SubVecVT = SubVec.getSimpleValueType();
5090 unsigned NumElems = OpVT.getVectorNumElements();
5091 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5093 assert(IdxVal + SubVecNumElems <= NumElems &&
5094 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5095 "Unexpected index value in INSERT_SUBVECTOR");
5097 // There are 3 possible cases:
5098 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5099 // 2. Subvector should be inserted in the upper part
5100 // (IdxVal + SubVecNumElems == NumElems)
5101 // 3. Subvector should be inserted in the middle (for example v2i1
5102 // to v16i1, index 2)
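  //
  // For example (illustrative): inserting a v2i1 subvector into a v16i1
  // vector at index 2 is case 3 and is handled by the shuffle at the end of
  // this function; indices 0 and 14 would be cases 1 and 2 respectively.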
5104 // extend to natively supported kshift
5105 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5106 MVT WideOpVT = OpVT;
  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
    WideOpVT = MinVT;

5110 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5111 SDValue Undef = DAG.getUNDEF(WideOpVT);
5112 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5113 Undef, SubVec, ZeroIdx);
  // Extract the sub-vector if required.
  auto ExtractSubVec = [&](SDValue V) {
    return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
                                                OpVT, V, ZeroIdx);
  };

  if (Vec.isUndef()) {
    if (IdxVal != 0) {
      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
      WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                               ShiftBits);
    }
    return ExtractSubVec(WideSubVec);
  }

5130 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5131 NumElems = WideOpVT.getVectorNumElements();
5132 unsigned ShiftLeft = NumElems - SubVecNumElems;
5133 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5134 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5135 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5136 Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5137 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
    return ExtractSubVec(Vec);
  }

  if (IdxVal == 0) {
    // Zero lower bits of the Vec
5143 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5144 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5145 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5146 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5147 // Merge them together, SubVec should be zero extended.
5148 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
                             SubVec, ZeroIdx);
5151 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }

  // Simple case when we put subvector in the upper part
5156 if (IdxVal + SubVecNumElems == NumElems) {
5157 // Zero upper bits of the Vec
5158 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5159 DAG.getConstant(IdxVal, dl, MVT::i8));
5160 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5161 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5162 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5163 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5164 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }

  // Subvector should be inserted in the middle - use shuffle
  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
                           SubVec, ZeroIdx);
5170 SmallVector<int, 64> Mask;
5171 for (unsigned i = 0; i < NumElems; ++i)
    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
                   i - IdxVal : i + NumElems);
  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
}
/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTORS nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
5181 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}
5188 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}
5195 /// Returns a vector of specified type with all bits set.
5196 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5197 /// Then bitcast to their original type, ensuring they get CSE'd.
5198 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5199 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5200 "Expected a 128/256/512-bit vector type");
5202 APInt Ones = APInt::getAllOnesValue(32);
5203 unsigned NumElts = VT.getSizeInBits() / 32;
5204 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
  return DAG.getBitcast(VT, Vec);
}
5208 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5209 SelectionDAG &DAG) {
5210 EVT InVT = In.getValueType();
5211 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5213 if (VT.is128BitVector() && InVT.is128BitVector())
5214 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5215 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5217 // For 256-bit vectors, we only need the lower (128-bit) input half.
5218 // For 512-bit vectors, we only need the lower input half or quarter.
5219 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5220 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5221 In = extractSubVector(In, 0, DAG, DL,
                           std::max(128, (int)VT.getSizeInBits() / Scale));
  }

  return DAG.getNode(Opc, DL, VT, In);
}
5228 /// Generate unpacklo/unpackhi shuffle mask.
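///
/// For example (illustrative): for v4i32 the non-unary lo mask is
/// {0, 4, 1, 5} and the hi mask is {2, 6, 3, 7}; for v8i16 the lo mask is
/// {0, 8, 1, 9, 2, 10, 3, 11}.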
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                                    bool Unary) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
5232 int NumElts = VT.getVectorNumElements();
5233 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5235 for (int i = 0; i < NumElts; ++i) {
5236 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5237 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5238 Pos += (Unary ? 0 : NumElts * (i % 2));
5239 Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
}
5244 /// Returns a vector_shuffle node for an unpackl operation.
5245 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5246 SDValue V1, SDValue V2) {
5247 SmallVector<int, 8> Mask;
5248 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
5252 /// Returns a vector_shuffle node for an unpackh operation.
5253 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5254 SDValue V1, SDValue V2) {
5255 SmallVector<int, 8> Mask;
5256 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
5260 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5261 /// This produces a shuffle where the low element of V2 is swizzled into the
5262 /// zero/undef vector, landing at element Idx.
5263 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
                                           bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5271 int NumElems = VT.getVectorNumElements();
5272 SmallVector<int, 16> MaskVec(NumElems);
5273 for (int i = 0; i != NumElems; ++i)
5274 // If this is the insertion idx, put the low elt of V2 here.
5275 MaskVec[i] = (i == Idx) ? NumElems : i;
  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
5279 static SDValue peekThroughBitcasts(SDValue V) {
5280 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}
5285 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5286 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5287 V.getOperand(0).hasOneUse())
    V = V.getOperand(0);
  return V;
}
5292 static const Constant *getTargetConstantFromNode(SDValue Op) {
5293 Op = peekThroughBitcasts(Op);
  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    return nullptr;

5299 SDValue Ptr = Load->getBasePtr();
5300 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5301 Ptr->getOpcode() == X86ISD::WrapperRIP)
5302 Ptr = Ptr->getOperand(0);
5304 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
  if (!CNode || CNode->isMachineConstantPoolEntry())
    return nullptr;

  return dyn_cast<Constant>(CNode->getConstVal());
}
5311 // Extract raw constant bits from constant pools.
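//
// For example (illustrative): requesting 8-bit elements from a v2i64
// build_vector {0x00FF, undef} yields sixteen byte-sized APInt values, with
// the undef source element either recorded in UndefElts or rejected,
// depending on AllowWholeUndefs/AllowPartialUndefs.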
5312 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                          APInt &UndefElts,
                                          SmallVectorImpl<APInt> &EltBits,
5315 bool AllowWholeUndefs = true,
5316 bool AllowPartialUndefs = true) {
5317 assert(EltBits.empty() && "Expected an empty EltBits vector");
5319 Op = peekThroughBitcasts(Op);
5321 EVT VT = Op.getValueType();
5322 unsigned SizeInBits = VT.getSizeInBits();
5323 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5324 unsigned NumElts = SizeInBits / EltSizeInBits;
5326 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5327 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5329 // Extract all the undef/constant element data and pack into single bitsets.
5330 APInt UndefBits(SizeInBits, 0);
5331 APInt MaskBits(SizeInBits, 0);
5333 // Split the undef/constant single bitset data into the target elements.
  auto SplitBitData = [&]() {
    // Don't split if we don't allow undef bits.
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
    if (UndefBits.getBoolValue() && !AllowUndefs)
      return false;

    UndefElts = APInt(NumElts, 0);
    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

    for (unsigned i = 0; i != NumElts; ++i) {
      unsigned BitOffset = i * EltSizeInBits;
      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

      // Only treat an element as UNDEF if all bits are UNDEF.
      if (UndefEltBits.isAllOnesValue()) {
        if (!AllowWholeUndefs)
          return false;
        UndefElts.setBit(i);
        continue;
      }

      // If only some bits are UNDEF then treat them as zero (or bail if not
      // supported).
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
        return false;

      APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
      EltBits[i] = Bits.getZExtValue();
    }
    return true;
  };
5366 // Collect constant bits and insert into mask/undef bit masks.
  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
                                unsigned BitOffset) {
    if (!Cst)
      return false;
    if (isa<UndefValue>(Cst)) {
      unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
      Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
      return true;
    }
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
      Mask.insertBits(CInt->getValue(), BitOffset);
      return true;
    }
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
      Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
      return true;
    }
    return false;
  };
5387 // Extract constant bits from build vector.
5388 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5389 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5390 const SDValue &Src = Op.getOperand(i);
5391 unsigned BitOffset = i * SrcEltSizeInBits;
      if (Src.isUndef()) {
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
        continue;
      }
      auto *Cst = cast<ConstantSDNode>(Src);
      APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
      MaskBits.insertBits(Bits, BitOffset);
    }
    return SplitBitData();
  }

5403 // Extract constant bits from constant pool vector.
5404 if (auto *Cst = getTargetConstantFromNode(Op)) {
5405 Type *CstTy = Cst->getType();
    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
      return false;

5409 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5410 for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
5411 if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
                             i * CstEltSizeInBits))
        return false;

    return SplitBitData();
  }

5418 // Extract constant bits from a broadcasted constant pool scalar.
5419 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5420 EltSizeInBits <= SrcEltSizeInBits) {
5421 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5422 APInt Bits(SizeInBits, 0);
5423 APInt Undefs(SizeInBits, 0);
5424 if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
5425 for (unsigned i = 0; i != NumSrcElts; ++i) {
5426 MaskBits |= Bits.shl(i * SrcEltSizeInBits);
5427 UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
        return SplitBitData();
      }
    }
  }

5434 // Extract a rematerialized scalar constant insertion.
5435 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5436 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5437 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5438 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5439 MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5440 MaskBits = MaskBits.zext(SizeInBits);
    return SplitBitData();
  }

  return false;
}
5447 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5448 unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask) {
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;

5453 // Extract the raw target constant bits.
5454 // FIXME: We currently don't support UNDEF bits or mask entries.
5455 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5456 EltBits, /* AllowWholeUndefs */ false,
                                     /* AllowPartialUndefs */ false))
    return false;

5460 // Insert the extracted elements into the mask.
5461 for (APInt Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());

  return true;
}
5467 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5468 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5469 /// operands in \p Ops, and returns true.
5470 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5471 /// IsUnary for shuffles which use a single input multiple times, and in those
5472 /// cases it will adjust the mask to only have indices within that single input.
5473 /// It is an error to call this with non-empty Mask/Ops vectors.
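///
/// For example (illustrative): an X86ISD::UNPCKL of v4i32 whose two operands
/// are the same node decodes to {0, 4, 1, 5}; because the node is "fake
/// unary", the mask is remapped to {0, 0, 1, 1} over the single input.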
5474 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5475 SmallVectorImpl<SDValue> &Ops,
5476 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  bool IsFakeUnary = false;
  switch(N->getOpcode()) {
  case X86ISD::BLENDI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUFP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::INSERTPS:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    DecodeZeroMoveLowMask(VT, Mask);
    IsUnary = true;
    break;
5557 case X86ISD::VBROADCAST: {
5558 SDValue N0 = N->getOperand(0);
5559 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5560 // add the pre-extracted value to the Ops vector.
5561 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5562 N0.getOperand(0).getValueType() == VT &&
5563 N0.getConstantOperandVal(1) == 0)
5564 Ops.push_back(N0.getOperand(0));
5566 // We only decode broadcasts of same-sized vectors, unless the broadcast
5567 // came from an extract from the original width. If we found one, we
    // pushed it onto the Ops vector above.
    if (N0.getValueType() == VT || !Ops.empty()) {
      DecodeVectorBroadcast(VT, Mask);
      IsUnary = true;
      break;
    }
    return false;
  }
  case X86ISD::VPERMILPV: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMILPMask(VT, RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMILPMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodePSHUFBMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodePSHUFBMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    DecodeMOVSLDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    DecodeMOVSHDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    DecodeMOVDDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
5636 case X86ISD::VPERMIL2: {
5637 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5638 unsigned MaskEltSize = VT.getScalarSizeInBits();
5639 SDValue MaskNode = N->getOperand(2);
5640 SDValue CtrlNode = N->getOperand(3);
5641 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5642 unsigned CtrlImm = CtrlOp->getZExtValue();
5643 SmallVector<uint64_t, 32> RawMask;
5644 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
        break;
      }
      if (auto *C = getTargetConstantFromNode(MaskNode)) {
        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
        break;
      }
    }
    return false;
  }
5655 case X86ISD::VPPERM: {
5656 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5657 SDValue MaskNode = N->getOperand(2);
5658 SmallVector<uint64_t, 32> RawMask;
5659 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodeVPPERMMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPPERMMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N->getOperand(1));
5673 SDValue MaskNode = N->getOperand(0);
5674 SmallVector<uint64_t, 32> RawMask;
5675 unsigned MaskEltSize = VT.getScalarSizeInBits();
5676 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMVMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMVMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
5686 case X86ISD::VPERMV3: {
5687 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5688 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5689 Ops.push_back(N->getOperand(0));
5690 Ops.push_back(N->getOperand(2));
5691 SDValue MaskNode = N->getOperand(1);
5692 unsigned MaskEltSize = VT.getScalarSizeInBits();
5693 if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
5699 case X86ISD::VPERMIV3: {
5700 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5701 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5702 Ops.push_back(N->getOperand(1));
5703 Ops.push_back(N->getOperand(2));
5704 SDValue MaskNode = N->getOperand(0);
5705 unsigned MaskEltSize = VT.getScalarSizeInBits();
5706 if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  default: llvm_unreachable("unknown target shuffle node");
  }

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zero'd elements.
  if (!AllowSentinelZero)
    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
      return false;

5724 // If we have a fake unary shuffle, the shuffle mask is spread across two
5725 // inputs that are actually the same node. Re-map the mask to always point
5726 // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

5732 // If we didn't already add operands in the opcode-specific code, default to
5733 // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}
5743 /// Check a target shuffle mask's inputs to see if we can set any values to
5744 /// SM_SentinelZero - this is for elements that are known to be zero
5745 /// (not just zeroable) from their inputs.
5746 /// Returns true if the target shuffle mask was decoded.
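///
/// For example (illustrative): if one shuffle input is a build_vector of
/// all-zero constants, every mask element referencing it can be replaced
/// with SM_SentinelZero.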
5747 static bool setTargetShuffleZeroElements(SDValue N,
5748 SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

5758 SDValue V1 = Ops[0];
5759 SDValue V2 = IsUnary ? V1 : Ops[1];
5761 V1 = peekThroughBitcasts(V1);
5762 V2 = peekThroughBitcasts(V2);
5764 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5765 "Illegal split of shuffle value type");
5766 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5768 // Extract known constant input data.
5769 APInt UndefSrcElts[2];
5770 SmallVector<APInt, 32> SrcEltBits[2];
5771 bool IsSrcConstant[2] = {
5772 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5773 SrcEltBits[0], true, false),
5774 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5775 SrcEltBits[1], true, false)};
5777 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int &M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0)
      continue;

5784 // Determine shuffle input and normalize the mask.
5785 unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      Mask[i] = SM_SentinelUndef;
      continue;
    }

5795 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5796 // TODO: We currently only set UNDEF for integer types - floats use the same
5797 // registers as vectors and many of the scalar folded loads rely on the
5798 // SCALAR_TO_VECTOR pattern.
5799 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5800 (Size % V.getValueType().getVectorNumElements()) == 0) {
5801 int Scale = Size / V.getValueType().getVectorNumElements();
5802 int Idx = M / Scale;
5803 if (Idx != 0 && !VT.isFloatingPoint())
5804 Mask[i] = SM_SentinelUndef;
5805 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        Mask[i] = SM_SentinelZero;
      continue;
    }

5810 // Attempt to extract from the source's constant bits.
5811 if (IsSrcConstant[SrcIdx]) {
5812 if (UndefSrcElts[SrcIdx][M])
5813 Mask[i] = SM_SentinelUndef;
5814 else if (SrcEltBits[SrcIdx][M] == 0)
        Mask[i] = SM_SentinelZero;
      continue;
    }
  }

5819 assert(VT.getVectorNumElements() == Mask.size() &&
5820 "Different mask size from vector size!");
5824 // Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than
// the destination value type.
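//
// For example (illustrative): a v2i64 X86ISD::VSRLI by 32 bits decodes to
// the 16-element byte mask {4,5,6,7,Z,Z,Z,Z,12,13,14,15,Z,Z,Z,Z}, where Z
// denotes SM_SentinelZero.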
5827 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5828 SmallVectorImpl<SDValue> &Ops) {
5832 MVT VT = N.getSimpleValueType();
5833 unsigned NumElts = VT.getVectorNumElements();
5834 unsigned NumSizeInBits = VT.getSizeInBits();
5835 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5836 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5837 "Expected byte aligned value types");
  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::AND:
5842 case X86ISD::ANDNP: {
5843 // Attempt to decode as a per-byte mask.
    APInt UndefElts;
    SmallVector<APInt, 32> EltBits;
5846 SDValue N0 = N.getOperand(0);
5847 SDValue N1 = N.getOperand(1);
5848 bool IsAndN = (X86ISD::ANDNP == Opcode);
5849 uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;

    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      if (UndefElts[i]) {
        Mask.push_back(SM_SentinelUndef);
        continue;
      }
      uint64_t ByteBits = EltBits[i].getZExtValue();
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
5865 case ISD::SCALAR_TO_VECTOR: {
5866 // Match against a scalar_to_vector of an extract from a similar vector.
5867 SDValue N0 = N.getOperand(0);
5868 if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5869 N0.getOperand(0).getValueType() != VT ||
5870 !isa<ConstantSDNode>(N0.getOperand(1)) ||
5871 NumElts <= N0.getConstantOperandVal(1) ||
        !N->isOnlyUserOf(N0.getNode()))
      return false;
    Ops.push_back(N0.getOperand(0));
    Mask.push_back(N0.getConstantOperandVal(1));
    Mask.append(NumElts - 1, SM_SentinelUndef);
    return true;
  }
5879 case X86ISD::PINSRB:
5880 case X86ISD::PINSRW: {
5881 SDValue InVec = N.getOperand(0);
5882 SDValue InScl = N.getOperand(1);
5883 uint64_t InIdx = N.getConstantOperandVal(2);
5884 assert(InIdx < NumElts && "Illegal insertion index");
5886 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
5887 if (X86::isZeroNode(InScl)) {
5888 Ops.push_back(InVec);
5889 for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
      return true;
    }

5894 // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
5895 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
    unsigned ExOp =
        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
5898 if (InScl.getOpcode() != ISD::AssertZext ||
        InScl.getOperand(0).getOpcode() != ExOp)
      return false;

5902 SDValue ExVec = InScl.getOperand(0).getOperand(0);
5903 uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
5904 assert(ExIdx < NumElts && "Illegal extraction index");
5905 Ops.push_back(InVec);
5906 Ops.push_back(ExVec);
5907 for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
5913 uint64_t ShiftVal = N.getConstantOperandVal(1);
5914 // Out of range bit shifts are guaranteed to be zero.
5915 if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }

5920 // We can only decode 'whole byte' bit shifts as shuffles.
    if ((ShiftVal % 8) != 0)
      return false;

5924 uint64_t ByteShift = ShiftVal / 8;
5925 unsigned NumBytes = NumSizeInBits / 8;
5926 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5927 Ops.push_back(N.getOperand(0));
5929 // Clear mask to all zeros and insert the shifted byte indices.
5930 Mask.append(NumBytes, SM_SentinelZero);
5932 if (X86ISD::VSHLI == Opcode) {
5933 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5934 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
5943 case ISD::ZERO_EXTEND_VECTOR_INREG:
5944 case X86ISD::VZEXT: {
5945 // TODO - add support for VPMOVZX with smaller input vector types.
5946 SDValue Src = N.getOperand(0);
5947 MVT SrcVT = Src.getSimpleValueType();
    if (NumSizeInBits != SrcVT.getSizeInBits())
      break;

    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}
5959 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
5960 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
5961 SmallVectorImpl<int> &Mask) {
5962 int MaskWidth = Mask.size();
5963 SmallVector<SDValue, 16> UsedInputs;
5964 for (int i = 0, e = Inputs.size(); i < e; ++i) {
5965 int lo = UsedInputs.size() * MaskWidth;
5966 int hi = lo + MaskWidth;
5967 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      UsedInputs.push_back(Inputs[i]);
      continue;
    }
    for (int &M : Mask)
      if (lo <= M)
        M -= MaskWidth;
  }
  Inputs = UsedInputs;
}
5978 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5979 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5980 /// remaining input indices in case we now have a unary shuffle and adjust the
5981 /// inputs accordingly.
5982 /// Returns true if the target shuffle mask was decoded.
5983 static bool resolveTargetShuffleInputs(SDValue Op,
5984 SmallVectorImpl<SDValue> &Inputs,
5985 SmallVectorImpl<int> &Mask) {
  if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
    if (!getFauxShuffleMask(Op, Mask, Inputs))
      return false;

  resolveTargetShuffleInputsAndMask(Inputs, Mask);
  return true;
}
5994 /// Returns the scalar element that will make up the ith
5995 /// element of the result of the vector shuffle.
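///
/// For example (illustrative): for shuffle<4,1,2,3>(V1, V2), Index 0 refers
/// to element 0 of V2, so if V2 is a build_vector its operand 0 is returned.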
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue(); // Limit search depth.
6001 SDValue V = SDValue(N, 0);
6002 EVT VT = V.getValueType();
6003 unsigned Opcode = V.getOpcode();
6005 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6006 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6007 int Elt = SV->getMaskElt(Index);
    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

6012 unsigned NumElems = VT.getVectorNumElements();
6013 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6014 : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }

6018 // Recurse into target specific vector shuffles to find scalars.
6019 if (isTargetShuffle(Opcode)) {
6020 MVT ShufVT = V.getSimpleValueType();
6021 MVT ShufSVT = ShufVT.getVectorElementType();
6022 int NumElems = (int)ShufVT.getVectorNumElements();
6023 SmallVector<int, 16> ShuffleMask;
6024 SmallVector<SDValue, 16> ShuffleOps;
    bool IsUnary;

    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
      return SDValue();

6030 int Elt = ShuffleMask[Index];
6031 if (Elt == SM_SentinelZero)
6032 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6033 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6034 if (Elt == SM_SentinelUndef)
6035 return DAG.getUNDEF(ShufSVT);
6037 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6038 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }

6043 // Actual nodes that may contain scalar elements
6044 if (Opcode == ISD::BITCAST) {
6045 V = V.getOperand(0);
6046 EVT SrcVT = V.getValueType();
6047 unsigned NumElems = VT.getVectorNumElements();
    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

6053 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6054 return (Index == 0) ? V.getOperand(0)
6055 : DAG.getUNDEF(VT.getVectorElementType());
  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}
6063 /// Custom lower build_vector of v16i8.
6064 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  SDValue V;
  bool First = true;

6075 // SSE4.1 - use PINSRB to insert each byte directly.
6076 if (Subtarget.hasSSE41()) {
6077 for (unsigned i = 0; i < 16; ++i) {
      bool IsNonZero = (NonZeros & (1 << i)) != 0;
      if (IsNonZero) {
6080 // If the build vector contains zeros or our first insertion is not the
6081 // first index then insert into zero vector to break any register
        // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
        if (First) {
          First = false;
          if (NumZero || 0 != i)
            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
          else {
6088 assert(0 == i && "Expected insertion into zero-index");
6089 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6090 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6091 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
            V = DAG.getBitcast(MVT::v16i8, V);
            continue;
          }
        }
6096 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
                      Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
      }
    }
    return V;
  }

6104 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6105 for (unsigned i = 0; i < 16; ++i) {
6106 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
6116 // FIXME: Investigate extending to i32 instead of just i16.
6117 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6118 SDValue ThisElt, LastElt;
6119 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6120 if (LastIsNonZero) {
6121         LastElt =
6122             DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6123       }
6124 if (ThisIsNonZero) {
6125 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6126 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6127 DAG.getConstant(8, dl, MVT::i8));
6128         if (LastIsNonZero)
6129           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6130       } else
6131         ThisElt = LastElt;
6133       if (ThisElt) {
6134         if (1 == i) {
6135 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6136 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6137 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6138 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6139 V = DAG.getBitcast(MVT::v8i16, V);
6140         } else {
6141           V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6142                           DAG.getIntPtrConstant(i / 2, dl));
6143         }
6144       }
6145     }
6146   }
6148   return DAG.getBitcast(MVT::v16i8, V);
6149 }
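// Sketch of the pre-SSE4.1 path above (illustrative): even byte b0 and odd
// byte b1 are merged as the i16 ((zext b1) << 8) | (zext b0) and inserted
// into word slot 0, so sixteen byte inserts collapse into at most eight
// SSE2 PINSRW word inserts.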
6151 /// Custom lower build_vector of v8i16.
6152 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6153 unsigned NumNonZero, unsigned NumZero,
6154                                      SelectionDAG &DAG,
6155                                      const X86Subtarget &Subtarget) {
6156   if (NumNonZero > 4 && !Subtarget.hasSSE41())
6157     return SDValue();
6159   SDLoc dl(Op);
6160   SDValue V;
6161   bool First = true;
6162 for (unsigned i = 0; i < 8; ++i) {
6163     bool IsNonZero = (NonZeros & (1 << i)) != 0;
6164     if (IsNonZero) {
6165 // If the build vector contains zeros or our first insertion is not the
6166 // first index then insert into zero vector to break any register
6167 // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6168       if (First) {
6169         First = false;
6170         if (NumZero || 0 != i)
6171 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6172         else {
6173           assert(0 == i && "Expected insertion into zero-index");
6174 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6175 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6176 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6177           V = DAG.getBitcast(MVT::v8i16, V);
6178           continue;
6179         }
6180       }
6181 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6182                     Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6183     }
6184   }
6186   return V;
6187 }
6189 /// Custom lower build_vector of v4i32 or v4f32.
6190 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6191 const X86Subtarget &Subtarget) {
6192 // Find all zeroable elements.
6193 std::bitset<4> Zeroable;
6194 for (int i=0; i < 4; ++i) {
6195 SDValue Elt = Op->getOperand(i);
6196     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6197   }
6198 assert(Zeroable.size() - Zeroable.count() > 1 &&
6199 "We expect at least two non-zero elements!");
6201 // We only know how to deal with build_vector nodes where elements are either
6202 // zeroable or extract_vector_elt with constant index.
6203 SDValue FirstNonZero;
6204 unsigned FirstNonZeroIdx;
6205 for (unsigned i=0; i < 4; ++i) {
6206     if (Zeroable[i])
6207       continue;
6208     SDValue Elt = Op->getOperand(i);
6209 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6210         !isa<ConstantSDNode>(Elt.getOperand(1)))
6211       return SDValue();
6212 // Make sure that this node is extracting from a 128-bit vector.
6213 MVT VT = Elt.getOperand(0).getSimpleValueType();
6214     if (!VT.is128BitVector())
6215       return SDValue();
6216 if (!FirstNonZero.getNode()) {
6217       FirstNonZero = Elt;
6218       FirstNonZeroIdx = i;
6219     }
6220   }
6222 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6223 SDValue V1 = FirstNonZero.getOperand(0);
6224 MVT VT = V1.getSimpleValueType();
6226 // See if this build_vector can be lowered as a blend with zero.
6227   SDValue Elt;
6228   unsigned EltMaskIdx, EltIdx;
6229   int Mask[4];
6230 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6231 if (Zeroable[EltIdx]) {
6232 // The zero vector will be on the right hand side.
6233       Mask[EltIdx] = EltIdx+4;
6234       continue;
6235     }
6237 Elt = Op->getOperand(EltIdx);
6238 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6239 EltMaskIdx = Elt.getConstantOperandVal(1);
6240     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6241       break;
6242     Mask[EltIdx] = EltIdx;
6243   }
6245   if (EltIdx == 4) {
6246 // Let the shuffle legalizer deal with blend operations.
6247 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6248 if (V1.getSimpleValueType() != VT)
6249 V1 = DAG.getBitcast(VT, V1);
6250     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6251   }
6253 // See if we can lower this build_vector to a INSERTPS.
6254   if (!Subtarget.hasSSE41())
6255     return SDValue();
6257 SDValue V2 = Elt.getOperand(0);
6258   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6259     V1 = SDValue();
6261 bool CanFold = true;
6262 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6263     if (Zeroable[i])
6264       continue;
6266     SDValue Current = Op->getOperand(i);
6267 SDValue SrcVector = Current->getOperand(0);
6268     if (!V1.getNode())
6269       V1 = SrcVector;
6270     CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6271   }
6273   if (!CanFold)
6274     return SDValue();
6276 assert(V1.getNode() && "Expected at least two non-zero elements!");
6277 if (V1.getSimpleValueType() != MVT::v4f32)
6278 V1 = DAG.getBitcast(MVT::v4f32, V1);
6279 if (V2.getSimpleValueType() != MVT::v4f32)
6280 V2 = DAG.getBitcast(MVT::v4f32, V2);
6282 // Ok, we can emit an INSERTPS instruction.
6283 unsigned ZMask = Zeroable.to_ulong();
6285 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6286 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6287   SDLoc DL(Op);
6288   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6289 DAG.getIntPtrConstant(InsertPSMask, DL));
6290   return DAG.getBitcast(VT, Result);
6291 }
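// INSERTPS immediate layout assumed above: bits [7:6] = CountS (source lane
// of V2), bits [5:4] = CountD (destination lane in V1), bits [3:0] = ZMask.
// For illustration, EltMaskIdx = 2, EltIdx = 1 and Zeroable = {3} encode as
// 2 << 6 | 1 << 4 | 0x8 == 0x98.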
6293 /// Return a vector logical shift node.
6294 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6295                          SelectionDAG &DAG, const TargetLowering &TLI,
6296                          const SDLoc &dl) {
6297 assert(VT.is128BitVector() && "Unknown type for VShift");
6298 MVT ShVT = MVT::v16i8;
6299 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6300 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6301 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6302 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6303 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6304   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6305 }
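// Example (illustrative): shifting a v2i64 right by 64 bits is emitted as
// VSRLDQ on v16i8 with a byte amount of 64 / 8 == 8, moving the high
// quadword into the low lane while zeros shift in from the top.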
6307 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6308 SelectionDAG &DAG) {
6310 // Check if the scalar load can be widened into a vector load. And if
6311 // the address is "base + cst" see if the cst can be "absorbed" into
6312 // the shuffle mask.
6313 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6314 SDValue Ptr = LD->getBasePtr();
6315     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6316       return SDValue();
6317 EVT PVT = LD->getValueType(0);
6318     if (PVT != MVT::i32 && PVT != MVT::f32)
6319       return SDValue();
6321     int FI = -1;
6322     int64_t Offset = 0;
6323 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6324       FI = FINode->getIndex();
6325       Offset = 0;
6326 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6327 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6328 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6329 Offset = Ptr.getConstantOperandVal(1);
6330       Ptr = Ptr.getOperand(0);
6331     } else {
6332       return SDValue();
6333     }
6335 // FIXME: 256-bit vector instructions don't require a strict alignment,
6336 // improve this code to support it better.
6337 unsigned RequiredAlign = VT.getSizeInBits()/8;
6338 SDValue Chain = LD->getChain();
6339 // Make sure the stack object alignment is at least 16 or 32.
6340 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6341 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6342 if (MFI.isFixedObjectIndex(FI)) {
6343 // Can't change the alignment. FIXME: It's possible to compute
6344 // the exact stack offset and reference FI + adjust offset instead.
6345       // If someone *really* cares about this, that's the way to implement it.
6346       return SDValue();
6347     }
6348       MFI.setObjectAlignment(FI, RequiredAlign);
6349     }
6352 // (Offset % 16 or 32) must be multiple of 4. Then address is then
6353 // Ptr + (Offset & ~15).
6356     if ((Offset % RequiredAlign) & 3)
6357       return SDValue();
6358     int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6359     if (StartOffset) {
6360       SDLoc DL(Ptr);
6361 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6362                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6363     }
6365 int EltNo = (Offset - StartOffset) >> 2;
6366 unsigned NumElems = VT.getVectorNumElements();
6368 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6369 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6370 LD->getPointerInfo().getWithOffset(StartOffset));
6372 SmallVector<int, 8> Mask(NumElems, EltNo);
6374     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6375   }
6377   return SDValue();
6378 }
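// Example (illustrative): splatting an i32 load from FI+8, once the stack
// object is given 16-byte alignment, widens to a v4i32 load at FI+0 plus a
// shuffle with mask <2,2,2,2>, since EltNo = (8 - 0) >> 2 == 2.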
6380 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6381 /// elements can be replaced by a single large load which has the same value as
6382 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6384 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6385 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6386 const SDLoc &DL, SelectionDAG &DAG,
6387 bool isAfterLegalize) {
6388 unsigned NumElems = Elts.size();
6390 int LastLoadedElt = -1;
6391 SmallBitVector LoadMask(NumElems, false);
6392 SmallBitVector ZeroMask(NumElems, false);
6393 SmallBitVector UndefMask(NumElems, false);
6395   // For each element in the initializer, see if we've found a load, zero or
6396   // an undef.
6397 for (unsigned i = 0; i < NumElems; ++i) {
6398 SDValue Elt = peekThroughBitcasts(Elts[i]);
6399     if (!Elt.getNode())
6400       return SDValue();
6402     if (Elt.isUndef())
6403       UndefMask[i] = true;
6404     else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6405       ZeroMask[i] = true;
6406     else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6407       LoadMask[i] = true;
6408       LastLoadedElt = i;
6409 // Each loaded element must be the correct fractional portion of the
6410 // requested vector load.
6411       if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6412         return SDValue();
6413     } else
6414       return SDValue();
6415   }
6416 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6417 "Incomplete element masks");
6419 // Handle Special Cases - all undef or undef/zero.
6420 if (UndefMask.count() == NumElems)
6421 return DAG.getUNDEF(VT);
6423 // FIXME: Should we return this as a BUILD_VECTOR instead?
6424 if ((ZeroMask | UndefMask).count() == NumElems)
6425 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6426 : DAG.getConstantFP(0.0, DL, VT);
6428 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6429 int FirstLoadedElt = LoadMask.find_first();
6430 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6431 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6432 EVT LDBaseVT = EltBase.getValueType();
6434 // Consecutive loads can contain UNDEFS but not ZERO elements.
6435   // Consecutive loads with UNDEFs and ZEROs elements require an
6436   // additional shuffle stage to clear the ZERO elements.
6437 bool IsConsecutiveLoad = true;
6438 bool IsConsecutiveLoadWithZeros = true;
6439 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6440     if (LoadMask[i]) {
6441       SDValue Elt = peekThroughBitcasts(Elts[i]);
6442 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6443 if (!DAG.areNonVolatileConsecutiveLoads(
6444 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6445 i - FirstLoadedElt)) {
6446 IsConsecutiveLoad = false;
6447         IsConsecutiveLoadWithZeros = false;
6448         break;
6449       }
6450 } else if (ZeroMask[i]) {
6451       IsConsecutiveLoad = false;
6452     }
6453   }
6455 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6456 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6457 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6458 "Cannot merge volatile loads.");
6459     SDValue NewLd =
6460         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6461 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6463 if (LDBase->hasAnyUseOfValue(1)) {
6464       SDValue NewChain =
6465           DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6466 SDValue(NewLd.getNode(), 1));
6467 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6468 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6469                              SDValue(NewLd.getNode(), 1));
6470     }
6472     return NewLd;
6473   };
6475 // LOAD - all consecutive load/undefs (must start/end with a load).
6476 // If we have found an entire vector of loads and undefs, then return a large
6477 // load of the entire vector width starting at the base pointer.
6478 // If the vector contains zeros, then attempt to shuffle those elements.
6479 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6480 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6481 assert(LDBase && "Did not find base load for merging consecutive loads");
6482 EVT EltVT = LDBase->getValueType(0);
6483 // Ensure that the input vector size for the merged loads matches the
6484 // cumulative size of the input elements.
6485     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6486       return SDValue();
6488     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6489       return SDValue();
6491 if (IsConsecutiveLoad)
6492 return CreateLoad(VT, LDBase);
6494 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6495 // vector and a zero vector to clear out the zero elements.
6496 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6497 SmallVector<int, 4> ClearMask(NumElems, -1);
6498 for (unsigned i = 0; i < NumElems; ++i) {
6499         if (ZeroMask[i])
6500           ClearMask[i] = i + NumElems;
6501         else if (LoadMask[i])
6502           ClearMask[i] = i;
6503       }
6504 SDValue V = CreateLoad(VT, LDBase);
6505 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6506 : DAG.getConstantFP(0.0, DL, VT);
6507       return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6508     }
6509   }
6511   int LoadSize =
6512 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6514 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6515 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6516 (LoadSize == 32 || LoadSize == 64) &&
6517 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6518 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6519 : MVT::getIntegerVT(LoadSize);
6520 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6521 if (TLI.isTypeLegal(VecVT)) {
6522 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6523 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6524       SDValue ResNode =
6525           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6526 LDBase->getPointerInfo(),
6527 LDBase->getAlignment(),
6528                                   false/*isVolatile*/, true/*ReadMem*/,
6529                                   false/*WriteMem*/);
6531 // Make sure the newly-created LOAD is in the same position as LDBase in
6532 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
6533 // and update uses of LDBase's output chain to use the TokenFactor.
6534 if (LDBase->hasAnyUseOfValue(1)) {
6535         SDValue NewChain =
6536             DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6537 SDValue(ResNode.getNode(), 1));
6538 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6539 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6540                                SDValue(ResNode.getNode(), 1));
6541       }
6543       return DAG.getBitcast(VT, ResNode);
6544     }
6545   }
6547   return SDValue();
6548 }
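// Example of the VZEXT_LOAD path (illustrative): a v4i32 build_vector
// <load a, load a+4, zero, zero> has LoadSize == 64, so it becomes a single
// 64-bit X86ISD::VZEXT_LOAD of 'a' that fills the low v2i32 half and
// implicitly zeroes the upper lanes.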
6550 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6551 unsigned SplatBitSize, LLVMContext &C) {
6552 unsigned ScalarSize = VT.getScalarSizeInBits();
6553 unsigned NumElm = SplatBitSize / ScalarSize;
6555 SmallVector<Constant *, 32> ConstantVec;
6556 for (unsigned i = 0; i < NumElm; i++) {
6557     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6558     Constant *Const;
6559 if (VT.isFloatingPoint()) {
6560 assert((ScalarSize == 32 || ScalarSize == 64) &&
6561 "Unsupported floating point scalar size");
6562 if (ScalarSize == 32)
6563 Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
6564       else
6565         Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
6566     } else
6567       Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6568     ConstantVec.push_back(Const);
6569   }
6570   return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6571 }
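// Example (illustrative): for VT = v8i32 and a 128-bit repeating SplatValue,
// NumElm == 128 / 32 == 4, so the returned ConstantVector holds the four i32
// pieces of the pattern for a subvector broadcast.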
6573 static bool isUseOfShuffle(SDNode *N) {
6574 for (auto *U : N->uses()) {
6575     if (isTargetShuffle(U->getOpcode()))
6576       return true;
6577     if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6578       return isUseOfShuffle(U);
6579   }
6580   return false;
6581 }
6583 /// Attempt to use the vbroadcast instruction to generate a splat value
6584 /// from a splat BUILD_VECTOR which uses:
6585 /// a. A single scalar load, or a constant.
6586 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6588 /// The VBROADCAST node is returned when a pattern is found,
6589 /// or SDValue() otherwise.
6590 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6591 const X86Subtarget &Subtarget,
6592 SelectionDAG &DAG) {
6593 // VBROADCAST requires AVX.
6594 // TODO: Splats could be generated for non-AVX CPUs using SSE
6595 // instructions, but there's less potential gain for only 128-bit vectors.
6596 if (!Subtarget.hasAVX())
6599   MVT VT = BVOp->getSimpleValueType(0);
6600   SDLoc dl(BVOp);
6602 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6603 "Unsupported vector type for broadcast.");
6605 BitVector UndefElements;
6606 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6608 // We need a splat of a single value to use broadcast, and it doesn't
6609 // make any sense if the value is only in one element of the vector.
6610 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6611 APInt SplatValue, Undef;
6612 unsigned SplatBitSize;
6613     bool HasUndef;
6614     // Check if this is a repeated constant pattern suitable for broadcasting.
6615 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6616 SplatBitSize > VT.getScalarSizeInBits() &&
6617 SplatBitSize < VT.getSizeInBits()) {
6618 // Avoid replacing with broadcast when it's a use of a shuffle
6619 // instruction to preserve the present custom lowering of shuffles.
6620       if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6621         return SDValue();
6622 // replace BUILD_VECTOR with broadcast of the repeated constants.
6623 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6624 LLVMContext *Ctx = DAG.getContext();
6625 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6626 if (Subtarget.hasAVX()) {
6627 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6628 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6629 // Splatted value can fit in one INTEGER constant in constant pool.
6630 // Load the constant and broadcast it.
6631 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6632 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6633 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6634 SDValue CP = DAG.getConstantPool(C, PVT);
6635 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6637 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6638           Ld = DAG.getLoad(
6639               CVT, dl, DAG.getEntryNode(), CP,
6640               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6641               Alignment);
6642 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6643 MVT::getVectorVT(CVT, Repeat), Ld);
6644 return DAG.getBitcast(VT, Brdcst);
6645 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6646 // Splatted value can fit in one FLOAT constant in constant pool.
6647 // Load the constant and broadcast it.
6648 // AVX have support for 32 and 64 bit broadcast for floats only.
6649 // No 64bit integer in 32bit subtarget.
6650 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6651 Constant *C = SplatBitSize == 32
6652 ? ConstantFP::get(Type::getFloatTy(*Ctx),
6653 SplatValue.bitsToFloat())
6654 : ConstantFP::get(Type::getDoubleTy(*Ctx),
6655 SplatValue.bitsToDouble());
6656 SDValue CP = DAG.getConstantPool(C, PVT);
6657 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6659 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6660           Ld = DAG.getLoad(
6661               CVT, dl, DAG.getEntryNode(), CP,
6662               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6663               Alignment);
6664 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6665 MVT::getVectorVT(CVT, Repeat), Ld);
6666 return DAG.getBitcast(VT, Brdcst);
6667 } else if (SplatBitSize > 64) {
6668 // Load the vector of constants and broadcast it.
6669 MVT CVT = VT.getScalarType();
6670           Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6671                                              *Ctx);
6672 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6673 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6674 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6675           Ld = DAG.getLoad(
6676               MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6677               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6678               Alignment);
6679 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6680           return DAG.getBitcast(VT, Brdcst);
6681         }
6682       }
6683     }
6684     return SDValue();
6685   }
6687 bool ConstSplatVal =
6688 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6690 // Make sure that all of the users of a non-constant load are from the
6691 // BUILD_VECTOR node.
6692 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6695 unsigned ScalarSize = Ld.getValueSizeInBits();
6696 bool IsGE256 = (VT.getSizeInBits() >= 256);
6698 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6699 // instruction to save 8 or more bytes of constant pool data.
6700 // TODO: If multiple splats are generated to load the same constant,
6701 // it may be detrimental to overall size. There needs to be a way to detect
6702 // that condition to know if this is truly a size win.
6703 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6705 // Handle broadcasting a single constant scalar from the constant pool
6707 // On Sandybridge (no AVX2), it is still better to load a constant vector
6708 // from the constant pool and not to broadcast it from a scalar.
6709 // But override that restriction when optimizing for size.
6710 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6711 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6712 EVT CVT = Ld.getValueType();
6713 assert(!CVT.isVector() && "Must not broadcast a vector type");
6715 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6716 // For size optimization, also splat v2f64 and v2i64, and for size opt
6717 // with AVX2, also splat i8 and i16.
6718 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6719 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6720 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6721 const Constant *C = nullptr;
6722 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6723 C = CI->getConstantIntValue();
6724 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6725 C = CF->getConstantFPValue();
6727 assert(C && "Invalid constant type");
6729 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6730       SDValue CP =
6731           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6732 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6733       Ld = DAG.getLoad(
6734           CVT, dl, DAG.getEntryNode(), CP,
6735           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6736           Alignment);
6738       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6739     }
6740   }
6742 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6744 // Handle AVX2 in-register broadcasts.
6745 if (!IsLoad && Subtarget.hasInt256() &&
6746 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6747 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6749   // The scalar source must be a normal load.
6750   if (!IsLoad)
6751     return SDValue();
6753 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6754 (Subtarget.hasVLX() && ScalarSize == 64))
6755 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6757 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6758 // double since there is no vbroadcastsd xmm
6759 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6760 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6761       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6762   }
6764   // Unsupported broadcast.
6765   return SDValue();
6766 }
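// Example of the repeated-constant path above (values for illustration):
// a v8i32 build_vector <0,1,0,1,0,1,0,1> has a 64-bit repeating pattern, so
// with AVX2 it becomes a constant-pool load of the i64 0x0000000100000000
// (element 0 in the low half) broadcast as v4i64 and bitcast back to v8i32.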
6768 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6769 /// underlying vector and index.
6771 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6772 /// index.
6773 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6774                                          SDValue ExtIdx) {
6775 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6776   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6777     return Idx;
6779   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6780   // lowered this:
6781   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6782   // to:
6783   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6784   //                            (extract_subvector (v8f32 %vreg0), Constant<4>),
6785   //                            undef)
6786   //                       Constant<2>)
6787 // In this case the vector is the extract_subvector expression and the index
6788 // is 2, as specified by the shuffle.
6789 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6790 SDValue ShuffleVec = SVOp->getOperand(0);
6791 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6792 assert(ShuffleVecVT.getVectorElementType() ==
6793 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6795 int ShuffleIdx = SVOp->getMaskElt(Idx);
6796 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6797     ExtractedFromVec = ShuffleVec;
6798     return ShuffleIdx;
6799   }
6800   return Idx;
6801 }
6803 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6804 MVT VT = Op.getSimpleValueType();
6806 // Skip if insert_vec_elt is not supported.
6807 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6808   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6809     return SDValue();
6811   SDLoc DL(Op);
6812   unsigned NumElems = Op.getNumOperands();
6814   SDValue VecIn1;
6815   SDValue VecIn2;
6816 SmallVector<unsigned, 4> InsertIndices;
6817 SmallVector<int, 8> Mask(NumElems, -1);
6819 for (unsigned i = 0; i != NumElems; ++i) {
6820 unsigned Opc = Op.getOperand(i).getOpcode();
6822     if (Opc == ISD::UNDEF)
6823       continue;
6825 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6826       // Quit if more than one element needs inserting.
6827       if (InsertIndices.size() > 1)
6828         return SDValue();
6830       InsertIndices.push_back(i);
6831       continue;
6832     }
6834 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6835 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6837 // Quit if non-constant index.
6838     if (!isa<ConstantSDNode>(ExtIdx))
6839       return SDValue();
6840 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6842 // Quit if extracted from vector of different type.
6843     if (ExtractedFromVec.getValueType() != VT)
6844       return SDValue();
6846 if (!VecIn1.getNode())
6847 VecIn1 = ExtractedFromVec;
6848 else if (VecIn1 != ExtractedFromVec) {
6849 if (!VecIn2.getNode())
6850 VecIn2 = ExtractedFromVec;
6851 else if (VecIn2 != ExtractedFromVec)
6852         // Quit if more than 2 vectors to shuffle
6853         return SDValue();
6854     }
6856     if (ExtractedFromVec == VecIn1)
6857       Mask[i] = Idx;
6858 else if (ExtractedFromVec == VecIn2)
6859       Mask[i] = Idx + NumElems;
6860   }
6862   if (!VecIn1.getNode())
6863     return SDValue();
6865 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6866 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6868 for (unsigned Idx : InsertIndices)
6869 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6870                     DAG.getIntPtrConstant(Idx, DL));
6872   return NV;
6873 }
6875 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6876 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6877 Op.getScalarValueSizeInBits() == 1 &&
6878 "Can not convert non-constant vector");
6879 uint64_t Immediate = 0;
6880 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6881 SDValue In = Op.getOperand(idx);
6882     if (!In.isUndef())
6883       Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
6884   }
6885   SDLoc dl(Op);
6886   MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6887   return DAG.getConstant(Immediate, dl, VT);
6888 }
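// Example (illustrative): the v8i1 constant <1,0,1,1,0,0,0,0> packs bit idx
// per element idx, giving Immediate = 0b00001101 = 0x0D returned as an i8.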
6889 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6890 SDValue
6891 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6893 MVT VT = Op.getSimpleValueType();
6894 assert((VT.getVectorElementType() == MVT::i1) &&
6895 "Unexpected type in LowerBUILD_VECTORvXi1!");
6897   SDLoc dl(Op);
6898   if (ISD::isBuildVectorAllZeros(Op.getNode()))
6899 return DAG.getTargetConstant(0, dl, VT);
6901 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6902 return DAG.getTargetConstant(1, dl, VT);
6904 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6905 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6906 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6907 return DAG.getBitcast(VT, Imm);
6908 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6909 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6910 DAG.getIntPtrConstant(0, dl));
6913 // Vector has one or more non-const elements
6914 uint64_t Immediate = 0;
6915 SmallVector<unsigned, 16> NonConstIdx;
6916 bool IsSplat = true;
6917 bool HasConstElts = false;
6918   int SplatIdx = -1;
6919   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6920 SDValue In = Op.getOperand(idx);
6923 if (!isa<ConstantSDNode>(In))
6924       NonConstIdx.push_back(idx);
6925     else {
6926 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
6927 HasConstElts = true;
6928     }
6929     if (SplatIdx < 0)
6930       SplatIdx = idx;
6931     else if (In != Op.getOperand(SplatIdx))
6932       IsSplat = false;
6933   }
6935 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
6936   if (IsSplat)
6937     return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
6938 DAG.getConstant(1, dl, VT),
6939 DAG.getConstant(0, dl, VT));
6941   // insert elements one by one
6942   SDValue DstVec;
6943   SDValue Imm;
6944   if (Immediate) {
6945 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6946     Imm = DAG.getConstant(Immediate, dl, ImmVT);
6947   }
6948 else if (HasConstElts)
6949 Imm = DAG.getConstant(0, dl, VT);
6950   else
6951     Imm = DAG.getUNDEF(VT);
6952 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6953 DstVec = DAG.getBitcast(VT, Imm);
6954   else {
6955     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6956 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6957                          DAG.getIntPtrConstant(0, dl));
6958   }
6960 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6961 unsigned InsertIdx = NonConstIdx[i];
6962 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6963 Op.getOperand(InsertIdx),
6964                          DAG.getIntPtrConstant(InsertIdx, dl));
6965   }
6966   return DstVec;
6967 }
6969 /// \brief Return true if \p N implements a horizontal binop and return the
6970 /// operands for the horizontal binop into V0 and V1.
6972 /// This is a helper function of LowerToHorizontalOp().
6973 /// This function checks that the build_vector \p N in input implements a
6974 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6975 /// operation to match.
6976 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6977 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6978 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6979 /// arithmetic sub.
6980 ///
6981 /// This function only analyzes elements of \p N whose indices are
6982 /// in range [BaseIdx, LastIdx).
6983 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6984                               SelectionDAG &DAG,
6985                               unsigned BaseIdx, unsigned LastIdx,
6986 SDValue &V0, SDValue &V1) {
6987 EVT VT = N->getValueType(0);
6989 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6990 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6991 "Invalid Vector in input!");
6993 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6994 bool CanFold = true;
6995 unsigned ExpectedVExtractIdx = BaseIdx;
6996 unsigned NumElts = LastIdx - BaseIdx;
6997 V0 = DAG.getUNDEF(VT);
6998 V1 = DAG.getUNDEF(VT);
7000 // Check if N implements a horizontal binop.
7001 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7002 SDValue Op = N->getOperand(i + BaseIdx);
7005 if (Op->isUndef()) {
7006 // Update the expected vector extract index.
7007 if (i * 2 == NumElts)
7008 ExpectedVExtractIdx = BaseIdx;
7009       ExpectedVExtractIdx += 2;
7010       continue;
7011     }
7013     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7015     if (!CanFold)
7016       break;
7018 SDValue Op0 = Op.getOperand(0);
7019 SDValue Op1 = Op.getOperand(1);
7021 // Try to match the following pattern:
7022 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7023 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7024 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7025 Op0.getOperand(0) == Op1.getOperand(0) &&
7026 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7027                isa<ConstantSDNode>(Op1.getOperand(1)));
7028     if (!CanFold)
7029       break;
7031 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7032 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7034 if (i * 2 < NumElts) {
7035       if (V0.isUndef()) {
7036         V0 = Op0.getOperand(0);
7037         if (V0.getValueType() != VT)
7038           return false;
7039       }
7040     } else {
7041       if (V1.isUndef()) {
7042 V1 = Op0.getOperand(0);
7043         if (V1.getValueType() != VT)
7044           return false;
7045       }
7046 if (i * 2 == NumElts)
7047         ExpectedVExtractIdx = BaseIdx;
7048     }
7050 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7051 if (I0 == ExpectedVExtractIdx)
7052 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7053 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7054 // Try to match the following dag sequence:
7055 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7056       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7057     } else
7058       CanFold = false;
7060     ExpectedVExtractIdx += 2;
7061   }
7063   return CanFold;
7064 }
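// Shape accepted above (illustrative), for a v4f32 build_vector:
//   (fadd (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
//   (fadd (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
//   (fadd (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
//   (fadd (extract_vector_elt B, 2), (extract_vector_elt B, 3))
// sets V0 = A and V1 = B, matching the lane order of HADDPS.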
7066 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7067 /// a concat_vector.
7069 /// This is a helper function of LowerToHorizontalOp().
7070 /// This function expects two 256-bit vectors called V0 and V1.
7071 /// At first, each vector is split into two separate 128-bit vectors.
7072 /// Then, the resulting 128-bit vectors are used to implement two
7073 /// horizontal binary operations.
7075 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7077 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7078 /// the two new horizontal binop.
7079 /// When Mode is set, the first horizontal binop dag node would take as input
7080 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7081 /// horizontal binop dag node would take as input the lower 128-bit of V1
7082 /// and the upper 128-bit of V1.
7084 /// HADD V0_LO, V0_HI
7085 /// HADD V1_LO, V1_HI
7087 /// Otherwise, the first horizontal binop dag node takes as input the lower
7088 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7089 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7091 /// HADD V0_LO, V1_LO
7092 /// HADD V0_HI, V1_HI
7094 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7095 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7096 /// the upper 128-bits of the result.
7097 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7098 const SDLoc &DL, SelectionDAG &DAG,
7099 unsigned X86Opcode, bool Mode,
7100 bool isUndefLO, bool isUndefHI) {
7101 MVT VT = V0.getSimpleValueType();
7102 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7103 "Invalid nodes in input!");
7105 unsigned NumElts = VT.getVectorNumElements();
7106 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7107 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7108 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7109 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7110 MVT NewVT = V0_LO.getSimpleValueType();
7112 SDValue LO = DAG.getUNDEF(NewVT);
7113 SDValue HI = DAG.getUNDEF(NewVT);
7115   if (Mode) {
7116     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7117 if (!isUndefLO && !V0->isUndef())
7118 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7119 if (!isUndefHI && !V1->isUndef())
7120 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7121   } else {
7122     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7123 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7124 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7126 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7127       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7128   }
7130   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7131 }
7133 /// Returns true iff \p BV builds a vector with the result equivalent to
7134 /// the result of ADDSUB operation.
7135 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7136 /// are written to the parameters \p Opnd0 and \p Opnd1.
7137 static bool isAddSub(const BuildVectorSDNode *BV,
7138 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7139 SDValue &Opnd0, SDValue &Opnd1) {
7141 MVT VT = BV->getSimpleValueType(0);
7142 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7143 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7144       (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7145     return false;
7147 unsigned NumElts = VT.getVectorNumElements();
7148 SDValue InVec0 = DAG.getUNDEF(VT);
7149 SDValue InVec1 = DAG.getUNDEF(VT);
7151 // Odd-numbered elements in the input build vector are obtained from
7152 // adding two integer/float elements.
7153 // Even-numbered elements in the input build vector are obtained from
7154 // subtracting two integer/float elements.
7155 unsigned ExpectedOpcode = ISD::FSUB;
7156 unsigned NextExpectedOpcode = ISD::FADD;
7157 bool AddFound = false;
7158 bool SubFound = false;
7160 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7161 SDValue Op = BV->getOperand(i);
7163 // Skip 'undef' values.
7164 unsigned Opcode = Op.getOpcode();
7165 if (Opcode == ISD::UNDEF) {
7166       std::swap(ExpectedOpcode, NextExpectedOpcode);
7167       continue;
7168     }
7170 // Early exit if we found an unexpected opcode.
7171     if (Opcode != ExpectedOpcode)
7172       return false;
7174 SDValue Op0 = Op.getOperand(0);
7175 SDValue Op1 = Op.getOperand(1);
7177 // Try to match the following pattern:
7178 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7179 // Early exit if we cannot match that sequence.
7180 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7181 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7182 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7183 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7184         Op0.getOperand(1) != Op1.getOperand(1))
7185       return false;
7187     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7188     if (I0 != i)
7189       return false;
7191     // We found a valid add/sub node. Update the information accordingly.
7192     if (i & 1)
7193       AddFound = true;
7194     else
7195       SubFound = true;
7197 // Update InVec0 and InVec1.
7198 if (InVec0.isUndef()) {
7199 InVec0 = Op0.getOperand(0);
7200       if (InVec0.getSimpleValueType() != VT)
7201         return false;
7202     }
7203 if (InVec1.isUndef()) {
7204 InVec1 = Op1.getOperand(0);
7205       if (InVec1.getSimpleValueType() != VT)
7206         return false;
7207     }
7209 // Make sure that operands in input to each add/sub node always
7210 // come from a same pair of vectors.
7211 if (InVec0 != Op0.getOperand(0)) {
7212       if (ExpectedOpcode == ISD::FSUB)
7213         return false;
7215 // FADD is commutable. Try to commute the operands
7216 // and then test again.
7217 std::swap(Op0, Op1);
7218       if (InVec0 != Op0.getOperand(0))
7219         return false;
7220     }
7222     if (InVec1 != Op1.getOperand(0))
7223       return false;
7225 // Update the pair of expected opcodes.
7226     std::swap(ExpectedOpcode, NextExpectedOpcode);
7227   }
7229 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7230   if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7231     return false;
7233   Opnd0 = InVec0;
7234   Opnd1 = InVec1;
7235   return true;
7236 }
7238 /// Returns true if is possible to fold MUL and an idiom that has already been
7239 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7240 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7241 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7243 /// Prior to calling this function it should be known that there is some
7244 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7245 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7246 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7247 /// of \p Opnd0 uses is expected to be equal to 2.
7248 /// For example, this function may be called for the following IR:
7249 /// %AB = fmul fast <2 x double> %A, %B
7250 /// %Sub = fsub fast <2 x double> %AB, %C
7251 /// %Add = fadd fast <2 x double> %AB, %C
7252 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7253 /// <2 x i32> <i32 0, i32 3>
7254 /// There is a def for %Addsub here, which potentially can be replaced by
7255 /// X86ISD::ADDSUB operation:
7256 /// %Addsub = X86ISD::ADDSUB %AB, %C
7257 /// and such ADDSUB can further be replaced with FMADDSUB:
7258 /// %Addsub = FMADDSUB %A, %B, %C.
7260 /// The main reason why this method is called before the replacement of the
7261 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7262 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7263 /// FMADDSUB is.
7264 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7265 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7266 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7267       !Subtarget.hasAnyFMA())
7268     return false;
7270 // FIXME: These checks must match the similar ones in
7271 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7272 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7273 // or MUL + ADDSUB to FMADDSUB.
7274 const TargetOptions &Options = DAG.getTarget().Options;
7275   bool AllowFusion =
7276       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7277   if (!AllowFusion)
7278     return false;
7280   Opnd2 = Opnd1;
7281   Opnd1 = Opnd0.getOperand(1);
7282   Opnd0 = Opnd0.getOperand(0);
7284   return true;
7285 }
7287 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
7288 /// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7289 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7290 const X86Subtarget &Subtarget,
7291 SelectionDAG &DAG) {
7292 SDValue Opnd0, Opnd1;
7293   if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7294     return SDValue();
7296   MVT VT = BV->getSimpleValueType(0);
7297   SDLoc DL(BV);
7299 // Try to generate X86ISD::FMADDSUB node here.
7300   SDValue Opnd2;
7301   if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7302 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7304 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7305 // the ADDSUB idiom has been successfully recognized. There are no known
7306 // X86 targets with 512-bit ADDSUB instructions!
7307 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7308   // recognition.
7309   if (VT.is512BitVector())
7310     return SDValue();
7312   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7313 }
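// Lane semantics of the node built above (illustrative): for v4f32,
// ADDSUB(X, Y) == <x0-y0, x1+y1, x2-y2, x3+y3>, i.e. even lanes subtract
// and odd lanes add, exactly ADDSUBPS.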
7315 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7316 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7317 const X86Subtarget &Subtarget,
7318 SelectionDAG &DAG) {
7319 MVT VT = BV->getSimpleValueType(0);
7320 unsigned NumElts = VT.getVectorNumElements();
7321 unsigned NumUndefsLO = 0;
7322 unsigned NumUndefsHI = 0;
7323 unsigned Half = NumElts/2;
7325 // Count the number of UNDEF operands in the build_vector in input.
7326 for (unsigned i = 0, e = Half; i != e; ++i)
7327     if (BV->getOperand(i)->isUndef())
7328       NumUndefsLO++;
7330 for (unsigned i = Half, e = NumElts; i != e; ++i)
7331     if (BV->getOperand(i)->isUndef())
7332       NumUndefsHI++;
7334 // Early exit if this is either a build_vector of all UNDEFs or all the
7335 // operands but one are UNDEF.
7336   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7337     return SDValue();
7339   SDLoc DL(BV);
7340 SDValue InVec0, InVec1;
7341 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7342 // Try to match an SSE3 float HADD/HSUB.
7343 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7344 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7346 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7347 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7348 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7349 // Try to match an SSSE3 integer HADD/HSUB.
7350 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7351 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7353 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7354       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7355   }
7357   if (!Subtarget.hasAVX())
7358     return SDValue();
7360 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7361 // Try to match an AVX horizontal add/sub of packed single/double
7362 // precision floating point values from 256-bit vectors.
7363 SDValue InVec2, InVec3;
7364 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7365 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7366 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7367 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7368 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7370 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7371 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7372 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7373 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7374 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7375 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7376 // Try to match an AVX2 horizontal add/sub of signed integers.
7377 SDValue InVec2, InVec3;
7378     unsigned X86Opcode;
7379     bool CanFold = true;
7381 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7382 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7383 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7384 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7385 X86Opcode = X86ISD::HADD;
7386 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7387 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7388 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7389 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7390       X86Opcode = X86ISD::HSUB;
7391     else
7392       CanFold = false;
7394     if (CanFold) {
7395 // Fold this build_vector into a single horizontal add/sub.
7396 // Do this only if the target has AVX2.
7397 if (Subtarget.hasAVX2())
7398 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7400 // Do not try to expand this build_vector into a pair of horizontal
7401 // add/sub if we can emit a pair of scalar add/sub.
7402       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7403         return SDValue();
7405       // Convert this build_vector into a pair of horizontal binop followed by
7406       // a concat vector.
7407 bool isUndefLO = NumUndefsLO == Half;
7408 bool isUndefHI = NumUndefsHI == Half;
7409 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7410                                    isUndefLO, isUndefHI);
7411     }
7412   }
7414 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7415 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7416     unsigned X86Opcode;
7417     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7418 X86Opcode = X86ISD::HADD;
7419 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7420 X86Opcode = X86ISD::HSUB;
7421 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7422 X86Opcode = X86ISD::FHADD;
7423 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7424       X86Opcode = X86ISD::FHSUB;
7425     else
7426       return SDValue();
7428 // Don't try to expand this build_vector into a pair of horizontal add/sub
7429 // if we can simply emit a pair of scalar add/sub.
7430     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7431       return SDValue();
7433     // Convert this build_vector into two horizontal add/sub followed by
7434     // a concat vector.
7435 bool isUndefLO = NumUndefsLO == Half;
7436 bool isUndefHI = NumUndefsHI == Half;
7437 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7438                                  isUndefLO, isUndefHI);
7439   }
7441   return SDValue();
7442 }
7444 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7445 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7446 /// just apply the bit to the vectors.
7447 /// NOTE: It's not in our interest to grow a general-purpose vectorizer
7448 /// from this, but enough scalar bit operations are created by the later
7449 /// legalization + scalarization stages to need basic support.
7450 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7451 SelectionDAG &DAG) {
7452   SDLoc DL(Op);
7453   MVT VT = Op->getSimpleValueType(0);
7454 unsigned NumElems = VT.getVectorNumElements();
7455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7457 // Check that all elements have the same opcode.
7458 // TODO: Should we allow UNDEFS and if so how many?
7459 unsigned Opcode = Op->getOperand(0).getOpcode();
7460 for (unsigned i = 1; i < NumElems; ++i)
7461     if (Opcode != Op->getOperand(i).getOpcode())
7462       return SDValue();
7464   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7465   switch (Opcode) {
7466   default:
7467     return SDValue();
7468   case ISD::AND:
7469   case ISD::OR:
7470   case ISD::XOR:
7471     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7472       return SDValue();
7473     break;
7474   }
7476 SmallVector<SDValue, 4> LHSElts, RHSElts;
7477 for (SDValue Elt : Op->ops()) {
7478 SDValue LHS = Elt.getOperand(0);
7479 SDValue RHS = Elt.getOperand(1);
7481 // We expect the canonicalized RHS operand to be the constant.
7482     if (!isa<ConstantSDNode>(RHS))
7483       return SDValue();
7484 LHSElts.push_back(LHS);
7485     RHSElts.push_back(RHS);
7486   }
7488 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7489 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7490   return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7491 }
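// Rewrite performed above (illustrative): a v4i32 build_vector of
// (xor a, 1), (xor b, 2), (xor c, 4), (xor d, 8) becomes
// (xor (build_vector a,b,c,d), (build_vector 1,2,4,8)), one vector op
// instead of four scalar ones.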
7493 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7494 /// functionality to do this, so it's all zeros, all ones, or some derivation
7495 /// that is cheap to calculate.
7496 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7497 const X86Subtarget &Subtarget) {
7498   SDLoc DL(Op);
7499   MVT VT = Op.getSimpleValueType();
7501 // Vectors containing all zeros can be matched by pxor and xorps.
7502 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7503 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7504 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7505     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7506       return Op;
7508     return getZeroVector(VT, Subtarget, DAG, DL);
7509   }
7511 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7512 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7513 // vpcmpeqd on 256-bit vectors.
7514 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7515 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7516         (VT == MVT::v8i32 && Subtarget.hasInt256()))
7517       return Op;
7519     return getOnesVector(VT, DAG, DL);
7520   }
7522   return SDValue();
7523 }
7525 SDValue
7526 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7527   SDLoc dl(Op);
7529   MVT VT = Op.getSimpleValueType();
7530 MVT ExtVT = VT.getVectorElementType();
7531 unsigned NumElems = Op.getNumOperands();
7533 // Generate vectors for predicate vectors.
7534 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7535 return LowerBUILD_VECTORvXi1(Op, DAG);
7537 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7538 return VectorConstant;
7540 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7541   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7542     return AddSub;
7543 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7544 return HorizontalOp;
7545   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7546     return Broadcast;
7547   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7548     return BitOp;
7550 unsigned EVTBits = ExtVT.getSizeInBits();
7552 unsigned NumZero = 0;
7553 unsigned NumNonZero = 0;
7554 uint64_t NonZeros = 0;
7555 bool IsAllConstants = true;
7556 SmallSet<SDValue, 8> Values;
7557 for (unsigned i = 0; i < NumElems; ++i) {
7558     SDValue Elt = Op.getOperand(i);
7559     if (Elt.isUndef())
7560       continue;
7561     Values.insert(Elt);
7562 if (Elt.getOpcode() != ISD::Constant &&
7563 Elt.getOpcode() != ISD::ConstantFP)
7564 IsAllConstants = false;
7565     if (X86::isZeroNode(Elt))
7566       NumZero++;
7567     else {
7568 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7569       NonZeros |= ((uint64_t)1 << i);
7570       NumNonZero++;
7571     }
7572   }
7574 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7575 if (NumNonZero == 0)
7576 return DAG.getUNDEF(VT);
7578 // Special case for single non-zero, non-undef, element.
7579 if (NumNonZero == 1) {
7580 unsigned Idx = countTrailingZeros(NonZeros);
7581 SDValue Item = Op.getOperand(Idx);
7583 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7584 // the value are obviously zero, truncate the value to i32 and do the
7585 // insertion that way. Only do this if the value is non-constant or if the
7586 // value is a constant being inserted into element 0. It is cheaper to do
7587 // a constant pool load than it is to do a movd + shuffle.
7588 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7589 (!IsAllConstants || Idx == 0)) {
7590 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7592 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7593 MVT VecVT = MVT::v4i32;
7595 // Truncate the value (which may itself be a constant) to i32, and
7596 // convert it to a vector with movd (S2V+shuffle to zero extend).
7597 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7598 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7599 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7600                         Item, Idx * 2, true, Subtarget, DAG));
7601       }
7602     }
7604 // If we have a constant or non-constant insertion into the low element of
7605 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7606 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7607 // depending on what the source datatype is.
7608     if (Idx == 0) {
7609       if (NumZero == 0)
7610         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7612 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7613 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7614 assert((VT.is128BitVector() || VT.is256BitVector() ||
7615 VT.is512BitVector()) &&
7616 "Expected an SSE value type!");
7617 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7618 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7619         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7620       }
7622       // We can't directly insert an i8 or i16 into a vector, so zero extend
7623       // it to i32 first.
7624 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7625 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7626 if (VT.getSizeInBits() >= 256) {
7627 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7628 if (Subtarget.hasAVX()) {
7629 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7630 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7631           } else {
7632             // Without AVX, we need to extend to a 128-bit vector and then
7633 // insert into the 256-bit vector.
7634 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7635 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7636             Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7637           }
7638         } else {
7639 assert(VT.is128BitVector() && "Expected an SSE value type!");
7640 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7641           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7642         }
7643         return DAG.getBitcast(VT, Item);
7644       }
7645     }
7647 // Is it a vector logical left shift?
7648 if (NumElems == 2 && Idx == 1 &&
7649 X86::isZeroNode(Op.getOperand(0)) &&
7650 !X86::isZeroNode(Op.getOperand(1))) {
7651 unsigned NumBits = VT.getSizeInBits();
7652 return getVShift(true, VT,
7653 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7654 VT, Op.getOperand(1)),
7655                        NumBits/2, DAG, *this, dl);
7656     }
7658     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7659       return SDValue();
7661 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7662 // is a non-constant being inserted into an element other than the low one,
7663 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7664     // movd/movss) to move this into the low element, then shuffle it into
7665     // place.
7666 if (EVTBits == 32) {
7667 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7668       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7669     }
7670   }
7672 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7673 if (Values.size() == 1) {
7674 if (EVTBits == 32) {
7675 // Instead of a shuffle like this:
7676 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7677 // Check if it's possible to issue this instead.
7678 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7679 unsigned Idx = countTrailingZeros(NonZeros);
7680 SDValue Item = Op.getOperand(Idx);
7681 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7682         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7683     }
7684     return SDValue();
7685   }
7687 // A vector full of immediates; various special cases are already
7688   // handled, so this is best done with a single constant-pool load.
7689   if (IsAllConstants)
7690     return SDValue();
7692 // See if we can use a vector load to get all of the elements.
7693 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7694 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7695     if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7696       return LD;
7697   }
7699 // For AVX-length vectors, build the individual 128-bit pieces and use
7700 // shuffles to put them in place.
7701 if (VT.is256BitVector() || VT.is512BitVector()) {
7702 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7704 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7706 // Build both the lower and upper subvector.
7708 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7709 SDValue Upper = DAG.getBuildVector(
7710 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7712 // Recreate the wider vector with the lower and upper part.
7713 if (VT.is256BitVector())
7714 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7715     return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7716   }
7718 // Let legalizer expand 2-wide build_vectors.
7719 if (EVTBits == 64) {
7720 if (NumNonZero == 1) {
7721 // One half is zero or undef.
7722 unsigned Idx = countTrailingZeros(NonZeros);
7723 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7724 Op.getOperand(Idx));
7725       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7726     }
7727     return SDValue();
7728   }
7730 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7731 if (EVTBits == 8 && NumElems == 16)
7732     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7733                                           DAG, Subtarget))
7734       return V;
7736 if (EVTBits == 16 && NumElems == 8)
7737     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7738                                           DAG, Subtarget))
7739       return V;
7741 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7742 if (EVTBits == 32 && NumElems == 4)
7743     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
7744       return V;
7746 // If element VT is == 32 bits, turn it into a number of shuffles.
7747 if (NumElems == 4 && NumZero > 0) {
7748 SmallVector<SDValue, 8> Ops(NumElems);
7749 for (unsigned i = 0; i < 4; ++i) {
7750 bool isZero = !(NonZeros & (1ULL << i));
7751       if (isZero)
7752         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7753       else
7754         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7755     }
7757 for (unsigned i = 0; i < 2; ++i) {
7758 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7759         default: break;
7760         case 0:
7761           Ops[i] = Ops[i*2];  // Must be a zero vector.
7762           break;
7763         case 1:
7764           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7765           break;
7766         case 2:
7767           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7768           break;
7769         case 3:
7770           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7771           break;
7772       }
7773     }
7775 bool Reverse1 = (NonZeros & 0x3) == 2;
7776 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7777 int MaskVec[] = {
7778 Reverse1 ? 1 : 0,
7779 Reverse1 ? 0 : 1,
7780 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7781 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7782 };
7783 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7786 if (Values.size() > 1 && VT.is128BitVector()) {
7787 // Check for a build vector from mostly shuffle plus few inserting.
7788 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7791 // For SSE 4.1, use insertps to put the high elements into the low element.
7792 if (Subtarget.hasSSE41()) {
7793 SDValue Result;
7794 if (!Op.getOperand(0).isUndef())
7795 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7796 else
7797 Result = DAG.getUNDEF(VT);
7799 for (unsigned i = 1; i < NumElems; ++i) {
7800 if (Op.getOperand(i).isUndef()) continue;
7801 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7802 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7803 }
7804 return Result;
7805 }
7807 // Otherwise, expand into a number of unpckl*, starting by extending each of
7808 // our (non-undef) elements to the full vector width with the element in the
7809 // bottom slot of the vector (which generates no code for SSE).
7810 SmallVector<SDValue, 8> Ops(NumElems);
7811 for (unsigned i = 0; i < NumElems; ++i) {
7812 if (!Op.getOperand(i).isUndef())
7813 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7815 Ops[i] = DAG.getUNDEF(VT);
7818 // Next, we iteratively mix elements, e.g. for v4f32:
7819 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7820 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7821 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7822 unsigned EltStride = NumElems >> 1;
7823 while (EltStride != 0) {
7824 for (unsigned i = 0; i < EltStride; ++i) {
7825 // If Ops[i+EltStride] is undef and this is the first round of mixing,
7826 // then it is safe to just drop this shuffle: V[i] is already in the
7827 // right place, the one element (since it's the first round) being
7828 // inserted as undef can be dropped. This isn't safe for successive
7829 // rounds because they will permute elements within both vectors.
7830 if (Ops[i+EltStride].isUndef() &&
7831 EltStride == NumElems/2)
7832 continue;
7834 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
7835 }
7836 EltStride >>= 1;
7837 }
7838 return Ops[0];
7839 }
7843 // 256-bit AVX can use the vinsertf128 instruction
7844 // to create 256-bit vectors from two other 128-bit ones.
7845 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7846 SDLoc dl(Op);
7847 MVT ResVT = Op.getSimpleValueType();
7849 assert((ResVT.is256BitVector() ||
7850 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7852 SDValue V1 = Op.getOperand(0);
7853 SDValue V2 = Op.getOperand(1);
7854 unsigned NumElems = ResVT.getVectorNumElements();
7855 if (ResVT.is256BitVector())
7856 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7858 if (Op.getNumOperands() == 4) {
7859 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7860 ResVT.getVectorNumElements()/2);
7861 SDValue V3 = Op.getOperand(2);
7862 SDValue V4 = Op.getOperand(3);
7863 return concat256BitVectors(
7864 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7865 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7866 NumElems, DAG, dl);
7867 }
7868 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7871 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
7872 const X86Subtarget &Subtarget,
7873 SelectionDAG & DAG) {
7874 SDLoc dl(Op);
7875 MVT ResVT = Op.getSimpleValueType();
7876 unsigned NumOfOperands = Op.getNumOperands();
7878 assert(isPowerOf2_32(NumOfOperands) &&
7879 "Unexpected number of operands in CONCAT_VECTORS");
7881 SDValue Undef = DAG.getUNDEF(ResVT);
7882 if (NumOfOperands > 2) {
7883 // Specialize the cases when all, or all but one, of the operands are undef.
7884 unsigned NumOfDefinedOps = 0;
7885 unsigned OpIdx = 0;
7886 for (unsigned i = 0; i < NumOfOperands; i++)
7887 if (!Op.getOperand(i).isUndef()) {
7888 NumOfDefinedOps++;
7889 OpIdx = i;
7890 }
7891 if (NumOfDefinedOps == 0)
7892 return DAG.getUNDEF(ResVT);
7893 if (NumOfDefinedOps == 1) {
7894 unsigned SubVecNumElts =
7895 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
7896 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
7897 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
7898 Op.getOperand(OpIdx), IdxVal);
7901 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7902 ResVT.getVectorNumElements()/2);
7903 SmallVector<SDValue, 2> Ops;
7904 for (unsigned i = 0; i < NumOfOperands/2; i++)
7905 Ops.push_back(Op.getOperand(i));
7906 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7907 Ops.clear();
7908 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
7909 Ops.push_back(Op.getOperand(i));
7910 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7911 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
7915 SDValue V1 = Op.getOperand(0);
7916 SDValue V2 = Op.getOperand(1);
7917 unsigned NumElems = ResVT.getVectorNumElements();
7918 assert(V1.getValueType() == V2.getValueType() &&
7919 V1.getValueType().getVectorNumElements() == NumElems/2 &&
7920 "Unexpected operands in CONCAT_VECTORS");
7922 if (ResVT.getSizeInBits() >= 16)
7923 return Op; // The operation is legal with KUNPCK
7925 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
7926 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
7927 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
7928 if (IsZeroV1 && IsZeroV2)
7929 return ZeroVec;
7931 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
7932 if (V2.isUndef())
7933 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7934 if (IsZeroV2)
7935 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
7937 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
7938 if (V1.isUndef())
7939 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
7941 if (IsZeroV1)
7942 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
7944 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7945 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
7948 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7949 const X86Subtarget &Subtarget,
7950 SelectionDAG &DAG) {
7951 MVT VT = Op.getSimpleValueType();
7952 if (VT.getVectorElementType() == MVT::i1)
7953 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7955 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7956 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7957 Op.getNumOperands() == 4)));
7959 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7960 // from two other 128-bit ones.
7962 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7963 return LowerAVXCONCAT_VECTORS(Op, DAG);
7966 //===----------------------------------------------------------------------===//
7967 // Vector shuffle lowering
7969 // This is an experimental code path for lowering vector shuffles on x86. It is
7970 // designed to handle arbitrary vector shuffles and blends, gracefully
7971 // degrading performance as necessary. It works hard to recognize idiomatic
7972 // shuffles and lower them to optimal instruction patterns without leaving
7973 // a framework that allows reasonably efficient handling of all vector shuffle
7974 // patterns.
7975 //===----------------------------------------------------------------------===//
7977 /// \brief Tiny helper function to identify a no-op mask.
7979 /// This is a somewhat boring predicate function. It checks whether the mask
7980 /// array input, which is assumed to be a single-input shuffle mask of the kind
7981 /// used by the X86 shuffle instructions (not a fully general
7982 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7983 /// in-place shuffle are 'no-op's.
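// For example, <0, 1, 2, 3> and <-1, 1, -1, 3> are both no-op masks for a
// 4-element shuffle, while <0, 1, 2, 2> is not (element 3 moves).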
7984 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7985 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7986 assert(Mask[i] >= -1 && "Out of bound mask element!");
7987 if (Mask[i] >= 0 && Mask[i] != i)
7988 return false;
7989 }
7990 return true;
7991 }
7993 /// \brief Test whether there are elements crossing 128-bit lanes in this
7994 /// shuffle mask.
7996 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7997 /// and we routinely test for these.
7998 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7999 int LaneSize = 128 / VT.getScalarSizeInBits();
8000 int Size = Mask.size();
8001 for (int i = 0; i < Size; ++i)
8002 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8003 return true;
8004 return false;
8005 }
8007 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8009 /// This checks a shuffle mask to see if it is performing the same
8010 /// lane-relative shuffle in each sub-lane. This trivially implies
8011 /// that it is also not lane-crossing. It may however involve a blend from the
8012 /// same lane of a second vector.
8014 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8015 /// non-trivial to compute in the face of undef lanes. The representation is
8016 /// suitable for use with existing 128-bit shuffles as entries from the second
8017 /// vector have been remapped to [LaneSize, 2*LaneSize).
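// For example, for v8i32 the mask <1, 1, 3, 3, 5, 5, 7, 7> applies the same
// lane-relative pattern <1, 1, 3, 3> in both 128-bit lanes, so RepeatedMask
// is populated with <1, 1, 3, 3>.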
8018 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8020 SmallVectorImpl<int> &RepeatedMask) {
8021 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8022 RepeatedMask.assign(LaneSize, -1);
8023 int Size = Mask.size();
8024 for (int i = 0; i < Size; ++i) {
8025 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8026 if (Mask[i] < 0)
8027 continue;
8028 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8029 // This entry crosses lanes, so there is no way to model this shuffle.
8030 return false;
8032 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8033 // Adjust second vector indices to start at LaneSize instead of Size.
8034 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8035 : Mask[i] % LaneSize + LaneSize;
8036 if (RepeatedMask[i % LaneSize] < 0)
8037 // This is the first non-undef entry in this slot of a 128-bit lane.
8038 RepeatedMask[i % LaneSize] = LocalM;
8039 else if (RepeatedMask[i % LaneSize] != LocalM)
8040 // Found a mismatch with the repeated mask.
8041 return false;
8042 }
8043 return true;
8044 }
8046 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8047 static bool
8048 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8049 SmallVectorImpl<int> &RepeatedMask) {
8050 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8053 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8054 static bool
8055 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8056 SmallVectorImpl<int> &RepeatedMask) {
8057 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8060 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8061 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8062 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8064 SmallVectorImpl<int> &RepeatedMask) {
8065 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8066 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8067 int Size = Mask.size();
8068 for (int i = 0; i < Size; ++i) {
8069 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8070 if (Mask[i] == SM_SentinelUndef)
8072 if (Mask[i] == SM_SentinelZero) {
8073 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8074 return false;
8075 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8076 continue;
8077 }
8078 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8079 // This entry crosses lanes, so there is no way to model this shuffle.
8080 return false;
8082 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8083 // Adjust second vector indices to start at LaneSize instead of Size.
8084 int LocalM =
8085 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8086 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8087 // This is the first non-undef entry in this slot of a 128-bit lane.
8088 RepeatedMask[i % LaneSize] = LocalM;
8089 else if (RepeatedMask[i % LaneSize] != LocalM)
8090 // Found a mismatch with the repeated mask.
8091 return false;
8092 }
8093 return true;
8094 }
8096 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8097 /// arguments.
8099 /// This is a fast way to test a shuffle mask against a fixed pattern:
8101 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
8103 /// It returns true if the mask is exactly as wide as the argument list, and
8104 /// each element of the mask is either -1 (signifying undef) or the value given
8105 /// in the argument.
8106 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8107 ArrayRef<int> ExpectedMask) {
8108 if (Mask.size() != ExpectedMask.size())
8109 return false;
8111 int Size = Mask.size();
8113 // If the values are build vectors, we can look through them to find
8114 // equivalent inputs that make the shuffles equivalent.
8115 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8116 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8118 for (int i = 0; i < Size; ++i) {
8119 assert(Mask[i] >= -1 && "Out of bound mask element!");
8120 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8121 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8122 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8123 if (!MaskBV || !ExpectedBV ||
8124 MaskBV->getOperand(Mask[i] % Size) !=
8125 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8126 return false;
8127 }
8128 }
8130 return true;
8131 }
8133 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8135 /// The masks must be exactly the same width.
8137 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8138 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8140 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
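// For example, <-1, 1, -1, 3> matches the expected mask <0, 1, 2, 3>, whereas
// <SM_SentinelZero, 1, 2, 3> only matches an expected mask whose first
// element is also SM_SentinelZero.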
8141 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8142 ArrayRef<int> ExpectedMask) {
8143 int Size = Mask.size();
8144 if (Size != (int)ExpectedMask.size())
8145 return false;
8147 for (int i = 0; i < Size; ++i)
8148 if (Mask[i] == SM_SentinelUndef)
8149 continue;
8150 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8151 return false;
8152 else if (Mask[i] != ExpectedMask[i])
8153 return false;
8155 return true;
8156 }
8158 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
8159 // mask.
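// For example, Mask <0, 5, 2, 7> with Zeroable bits 1 and 3 set produces the
// target mask <0, SM_SentinelZero, 2, SM_SentinelZero>.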
8160 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8161 const APInt &Zeroable) {
8162 int NumElts = Mask.size();
8163 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8165 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8166 for (int i = 0; i != NumElts; ++i) {
8167 int M = Mask[i];
8168 if (M == SM_SentinelUndef)
8169 continue;
8170 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8171 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8172 }
8173 return TargetMask;
8174 }
8176 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
8177 // instructions.
8178 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8179 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8180 return false;
8182 SmallVector<int, 8> Unpcklwd;
8183 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8184 /* Unary = */ false);
8185 SmallVector<int, 8> Unpckhwd;
8186 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8187 /* Unary = */ false);
8188 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8189 isTargetShuffleEquivalent(Mask, Unpckhwd));
8190 return IsUnpackwdMask;
8193 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8195 /// This helper function produces an 8-bit shuffle immediate corresponding to
8196 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8197 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions,
8198 /// for example.
8200 /// NB: We rely heavily on "undef" masks preserving the input lane.
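// For example, the reversal mask <3, 2, 1, 0> encodes as
// (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6) == 0x1B, the immediate used by
// e.g. 'pshufd $0x1b' to reverse the four elements.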
8201 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8202 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8203 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8204 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8205 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8206 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8208 unsigned Imm = 0;
8209 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8210 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8211 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8212 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8213 return Imm;
8214 }
8216 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8217 SelectionDAG &DAG) {
8218 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8221 /// \brief Compute whether each element of a shuffle is zeroable.
8223 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8224 /// Either it is an undef element in the shuffle mask, the element of the input
8225 /// referenced is undef, or the element of the input referenced is known to be
8226 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8227 /// as many lanes with this technique as possible to simplify the remaining
8228 /// shuffle.
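// For example, with Mask <0, 5, 2, 7> and V2 an all-zeros build vector, the
// elements taken from V2 (mask indices 5 and 7) are zeroable, so bits 1 and
// 3 of the returned APInt are set.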
8229 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8230 SDValue V1, SDValue V2) {
8231 APInt Zeroable(Mask.size(), 0);
8232 V1 = peekThroughBitcasts(V1);
8233 V2 = peekThroughBitcasts(V2);
8235 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8236 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8238 int VectorSizeInBits = V1.getValueSizeInBits();
8239 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8240 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8242 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8243 int M = Mask[i];
8244 // Handle the easy cases.
8245 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8246 Zeroable.setBit(i);
8247 continue;
8248 }
8250 // Determine shuffle input and normalize the mask.
8251 SDValue V = M < Size ? V1 : V2;
8252 M %= Size;
8254 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8255 if (V.getOpcode() != ISD::BUILD_VECTOR)
8256 continue;
8258 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8259 // the (larger) source element must be UNDEF/ZERO.
8260 if ((Size % V.getNumOperands()) == 0) {
8261 int Scale = Size / V->getNumOperands();
8262 SDValue Op = V.getOperand(M / Scale);
8263 if (Op.isUndef() || X86::isZeroNode(Op))
8264 Zeroable.setBit(i);
8265 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8266 APInt Val = Cst->getAPIntValue();
8267 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8268 Val = Val.getLoBits(ScalarSizeInBits);
8269 if (Val == 0)
8270 Zeroable.setBit(i);
8271 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8272 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8273 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8274 Val = Val.getLoBits(ScalarSizeInBits);
8275 if (Val == 0)
8276 Zeroable.setBit(i);
8277 }
8278 continue;
8279 }
8281 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8282 // elements must be UNDEF or ZERO.
8283 if ((V.getNumOperands() % Size) == 0) {
8284 int Scale = V->getNumOperands() / Size;
8285 bool AllZeroable = true;
8286 for (int j = 0; j < Scale; ++j) {
8287 SDValue Op = V.getOperand((M * Scale) + j);
8288 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8289 }
8290 if (AllZeroable)
8291 Zeroable.setBit(i);
8292 continue;
8293 }
8294 }
8296 return Zeroable;
8297 }
8299 // The shuffle result is as follows:
8300 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
8301 // Each element of Zeroable corresponds to a particular element of Mask, as
8302 // described in the computeZeroableShuffleElements function.
8304 // The function looks for a sub-mask whose nonzero elements are in
8305 // increasing order. If such a sub-mask exists, the function returns true.
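// For example, for v4i32 the target mask <Z, Z, 4, 5> (Z = zeroable) keeps
// its nonzero elements 4, 5 in increasing order with all of the zeros on the
// left, so it matches with IsZeroSideLeft == true.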
8306 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8307 ArrayRef<int> Mask, const EVT &VectorType,
8308 bool &IsZeroSideLeft) {
8309 int NextElement = -1;
8310 // Check if the Mask's nonzero elements are in increasing order.
8311 for (int i = 0, e = Mask.size(); i < e; i++) {
8312 // Check that the mask's zero elements are built from only zeros.
8313 assert(Mask[i] >= -1 && "Out of bound mask element!");
8314 if (Mask[i] < 0)
8315 return false;
8316 if (Zeroable[i])
8317 continue;
8318 // Find the lowest non-zero element.
8319 if (NextElement < 0) {
8320 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8321 IsZeroSideLeft = NextElement != 0;
8323 // Exit if the mask's non-zero elements are not in increasing order.
8324 if (NextElement != Mask[i])
8325 return false;
8326 NextElement++;
8327 }
8328 return true;
8329 }
8331 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8332 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8333 ArrayRef<int> Mask, SDValue V1,
8334 SDValue V2,
8335 const APInt &Zeroable,
8336 const X86Subtarget &Subtarget,
8337 SelectionDAG &DAG) {
8338 int Size = Mask.size();
8339 int LaneSize = 128 / VT.getScalarSizeInBits();
8340 const int NumBytes = VT.getSizeInBits() / 8;
8341 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8343 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8344 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8345 (Subtarget.hasBWI() && VT.is512BitVector()));
8347 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8348 // Sign bit set in i8 mask means zero element.
8349 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8351 SDValue V;
8352 for (int i = 0; i < NumBytes; ++i) {
8353 int M = Mask[i / NumEltBytes];
8354 if (M < 0) {
8355 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8356 continue;
8357 }
8358 if (Zeroable[i / NumEltBytes]) {
8359 PSHUFBMask[i] = ZeroMask;
8360 continue;
8361 }
8363 // We can only use a single input of V1 or V2.
8364 SDValue SrcV = (M >= Size ? V2 : V1);
8365 if (V && V != SrcV)
8366 return SDValue();
8367 V = SrcV;
8368 M %= Size;
8370 // PSHUFB can't cross lanes, ensure this doesn't happen.
8371 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8372 return SDValue();
8374 M = M % LaneSize;
8375 M = M * NumEltBytes + (i % NumEltBytes);
8376 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8377 }
8378 assert(V && "Failed to find a source input");
8380 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8381 return DAG.getBitcast(
8382 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8383 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8386 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8387 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8388 const SDLoc &dl);
8390 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
8391 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8392 const APInt &Zeroable,
8393 ArrayRef<int> Mask, SDValue &V1,
8394 SDValue &V2, SelectionDAG &DAG,
8395 const X86Subtarget &Subtarget) {
8396 bool IsLeftZeroSide = true;
8397 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8398 IsLeftZeroSide))
8399 return SDValue();
8400 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8401 MVT IntegerType =
8402 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8403 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8404 unsigned NumElts = VT.getVectorNumElements();
8405 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8406 "Unexpected number of vector elements");
8407 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8408 Subtarget, DAG, DL);
8409 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8410 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8411 return DAG.getSelect(DL, VT, VMask,
8412 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8413 ZeroVector);
8414 }
8416 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8417 unsigned &UnpackOpcode, bool IsUnary,
8418 ArrayRef<int> TargetMask, SDLoc &DL,
8419 SelectionDAG &DAG,
8420 const X86Subtarget &Subtarget) {
8421 int NumElts = VT.getVectorNumElements();
8423 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8424 for (int i = 0; i != NumElts; i += 2) {
8425 int M1 = TargetMask[i + 0];
8426 int M2 = TargetMask[i + 1];
8427 Undef1 &= (SM_SentinelUndef == M1);
8428 Undef2 &= (SM_SentinelUndef == M2);
8429 Zero1 &= isUndefOrZero(M1);
8430 Zero2 &= isUndefOrZero(M2);
8432 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8433 "Zeroable shuffle detected");
8435 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8436 SmallVector<int, 64> Unpckl, Unpckh;
8437 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8438 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8439 UnpackOpcode = X86ISD::UNPCKL;
8440 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8441 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8442 return true;
8443 }
8445 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8446 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8447 UnpackOpcode = X86ISD::UNPCKH;
8448 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8449 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8450 return true;
8451 }
8453 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
8454 if (IsUnary && (Zero1 || Zero2)) {
8455 // Don't bother if we can blend instead.
8456 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8457 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8458 return false;
8460 bool MatchLo = true, MatchHi = true;
8461 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8462 int M = TargetMask[i];
8464 // Ignore if the input is known to be zero or the index is undef.
8465 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8466 (M == SM_SentinelUndef))
8467 continue;
8469 MatchLo &= (M == Unpckl[i]);
8470 MatchHi &= (M == Unpckh[i]);
8473 if (MatchLo || MatchHi) {
8474 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8475 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8476 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8477 return true;
8478 }
8479 }
8481 // If a binary shuffle, commute and try again.
8483 ShuffleVectorSDNode::commuteMask(Unpckl);
8484 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8485 UnpackOpcode = X86ISD::UNPCKL;
8486 std::swap(V1, V2);
8487 return true;
8488 }
8490 ShuffleVectorSDNode::commuteMask(Unpckh);
8491 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8492 UnpackOpcode = X86ISD::UNPCKH;
8493 std::swap(V1, V2);
8494 return true;
8495 }
8497 return false;
8498 }
8501 // X86 has dedicated unpack instructions that can handle specific blend
8502 // operations: UNPCKH and UNPCKL.
8503 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8504 ArrayRef<int> Mask, SDValue V1,
8505 SDValue V2, SelectionDAG &DAG) {
8506 SmallVector<int, 8> Unpckl;
8507 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8508 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8509 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8511 SmallVector<int, 8> Unpckh;
8512 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8513 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8514 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8516 // Commute and try again.
8517 ShuffleVectorSDNode::commuteMask(Unpckl);
8518 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8519 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8521 ShuffleVectorSDNode::commuteMask(Unpckh);
8522 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8523 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8525 return SDValue();
8526 }
8528 /// \brief Try to emit a bitmask instruction for a shuffle.
8530 /// This handles cases where we can model a blend exactly as a bitmask due to
8531 /// one of the inputs being zeroable.
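// For example, if Mask is <0, 5, 2, 7> and the elements taken from V2 are
// zeroable, the shuffle can be lowered to the single bitmask operation
// V1 & <-1, 0, -1, 0>.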
8532 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8533 SDValue V2, ArrayRef<int> Mask,
8534 const APInt &Zeroable,
8535 SelectionDAG &DAG) {
8536 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8537 MVT EltVT = VT.getVectorElementType();
8538 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8539 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8540 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8541 SDValue V;
8542 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8543 if (Zeroable[i])
8544 continue;
8545 if (Mask[i] % Size != i)
8546 return SDValue(); // Not a blend.
8547 if (!V)
8548 V = Mask[i] < Size ? V1 : V2;
8549 else if (V != (Mask[i] < Size ? V1 : V2))
8550 return SDValue(); // Can only let one input through the mask.
8552 VMaskOps[i] = AllOnes;
8553 }
8554 if (!V)
8555 return SDValue(); // No non-zeroable elements!
8557 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8558 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8561 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8563 /// This is used as a fallback approach when first class blend instructions are
8564 /// unavailable. Currently it is only suitable for integer vectors, but could
8565 /// be generalized for floating point vectors if desirable.
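// For example, the v4i32 blend <0, 5, 2, 7> becomes
// (V1 & <-1, 0, -1, 0>) | (V2 & <0, -1, 0, -1>).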
8566 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8567 SDValue V2, ArrayRef<int> Mask,
8568 SelectionDAG &DAG) {
8569 assert(VT.isInteger() && "Only supports integer vector types!");
8570 MVT EltVT = VT.getVectorElementType();
8571 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8572 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8573 SmallVector<SDValue, 16> MaskOps;
8574 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8575 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8576 return SDValue(); // Shuffled input!
8577 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8580 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8581 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8582 // We have to cast V2 around.
8583 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8584 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8585 DAG.getBitcast(MaskVT, V1Mask),
8586 DAG.getBitcast(MaskVT, V2)));
8587 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8590 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
8591 SDValue PreservedSrc,
8592 const X86Subtarget &Subtarget,
8593 SelectionDAG &DAG);
8595 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
8596 MutableArrayRef<int> TargetMask,
8597 bool &ForceV1Zero, bool &ForceV2Zero,
8598 uint64_t &BlendMask) {
8599 bool V1IsZeroOrUndef =
8600 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
8601 bool V2IsZeroOrUndef =
8602 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
8604 BlendMask = 0;
8605 ForceV1Zero = false, ForceV2Zero = false;
8606 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8608 // Attempt to generate the binary blend mask. If an input is zero then
8609 // we can use any lane.
8610 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8611 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
8612 int M = TargetMask[i];
8613 if (M == SM_SentinelUndef)
8614 continue;
8615 if (M == i)
8616 continue;
8617 if (M == i + Size) {
8618 BlendMask |= 1ull << i;
8619 continue;
8620 }
8621 if (M == SM_SentinelZero) {
8622 if (V1IsZeroOrUndef) {
8623 ForceV1Zero = true;
8624 TargetMask[i] = i;
8625 continue;
8626 }
8627 if (V2IsZeroOrUndef) {
8628 ForceV2Zero = true;
8629 BlendMask |= 1ull << i;
8630 TargetMask[i] = i + Size;
8631 continue;
8632 }
8633 }
8634 return false;
8635 }
8636 return true;
8637 }
8639 uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
8640 uint64_t ScaledMask = 0;
8641 for (int i = 0; i != Size; ++i)
8642 if (BlendMask & (1ull << i))
8643 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
8644 return ScaledMask;
8645 }
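// e.g. scaleVectorShuffleBlendMask(0b0101, /*Size=*/4, /*Scale=*/2) returns
// 0b00110011: each selected element is widened to Scale consecutive mask bits.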
8647 /// \brief Try to emit a blend instruction for a shuffle.
8649 /// This doesn't do any checks for the availability of instructions for blending
8650 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8651 /// be matched in the backend with the type given. What it does check for is
8652 /// that the shuffle mask is a blend, or convertible into a blend with zero.
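// For example, the v4f32 shuffle <0, 5, 2, 7> takes lanes 1 and 3 from V2,
// so it is a blend with BlendMask == 0b1010.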
8653 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8654 SDValue V2, ArrayRef<int> Original,
8655 const APInt &Zeroable,
8656 const X86Subtarget &Subtarget,
8657 SelectionDAG &DAG) {
8658 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
8660 uint64_t BlendMask = 0;
8661 bool ForceV1Zero = false, ForceV2Zero = false;
8662 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
8663 BlendMask))
8664 return SDValue();
8666 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8667 if (ForceV1Zero)
8668 V1 = getZeroVector(VT, Subtarget, DAG, DL);
8669 if (ForceV2Zero)
8670 V2 = getZeroVector(VT, Subtarget, DAG, DL);
8672 switch (VT.SimpleTy) {
8673 case MVT::v2f64:
8674 case MVT::v4f32:
8675 case MVT::v4f64:
8676 case MVT::v8f32:
8677 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8678 DAG.getConstant(BlendMask, DL, MVT::i8));
8680 case MVT::v4i64:
8681 case MVT::v8i32:
8682 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8683 LLVM_FALLTHROUGH;
8684 case MVT::v2i64:
8685 case MVT::v4i32:
8686 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8687 // that instruction.
8688 if (Subtarget.hasAVX2()) {
8689 // Scale the blend by the number of 32-bit dwords per element.
8690 int Scale = VT.getScalarSizeInBits() / 32;
8691 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8692 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8693 V1 = DAG.getBitcast(BlendVT, V1);
8694 V2 = DAG.getBitcast(BlendVT, V2);
8695 return DAG.getBitcast(
8696 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8697 DAG.getConstant(BlendMask, DL, MVT::i8)));
8698 }
8699 LLVM_FALLTHROUGH;
8700 case MVT::v8i16: {
8701 // For integer shuffles we need to expand the mask and cast the inputs to
8702 // v8i16s prior to blending.
8703 int Scale = 8 / VT.getVectorNumElements();
8704 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8705 V1 = DAG.getBitcast(MVT::v8i16, V1);
8706 V2 = DAG.getBitcast(MVT::v8i16, V2);
8707 return DAG.getBitcast(VT,
8708 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8709 DAG.getConstant(BlendMask, DL, MVT::i8)));
8710 }
8712 case MVT::v16i16: {
8713 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8714 SmallVector<int, 8> RepeatedMask;
8715 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8716 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8717 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8718 BlendMask = 0;
8719 for (int i = 0; i < 8; ++i)
8720 if (RepeatedMask[i] >= 8)
8721 BlendMask |= 1ull << i;
8722 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8723 DAG.getConstant(BlendMask, DL, MVT::i8));
8724 }
8725 LLVM_FALLTHROUGH;
8726 }
8727 case MVT::v16i8:
8728 case MVT::v32i8: {
8729 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8730 "256-bit byte-blends require AVX2 support!");
8732 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
8733 MVT IntegerType =
8734 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8735 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8736 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8739 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8740 if (SDValue Masked =
8741 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8742 return Masked;
8744 // Scale the blend by the number of bytes per element.
8745 int Scale = VT.getScalarSizeInBits() / 8;
8747 // This form of blend is always done on bytes. Compute the byte vector
8748 // type.
8749 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8751 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8752 // mix of LLVM's code generator and the x86 backend. We tell the code
8753 // generator that boolean values in the elements of an x86 vector register
8754 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8755 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8756 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8757 // of the element (the remaining are ignored) and 0 in that high bit would
8758 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8759 // the LLVM model for boolean values in vector elements gets the relevant
8760 // bit set, it is set backwards and over-constrained relative to x86's
8761 // actual model.
8762 SmallVector<SDValue, 32> VSELECTMask;
8763 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8764 for (int j = 0; j < Scale; ++j)
8765 VSELECTMask.push_back(
8766 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8767 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8768 MVT::i8));
8770 V1 = DAG.getBitcast(BlendVT, V1);
8771 V2 = DAG.getBitcast(BlendVT, V2);
8772 return DAG.getBitcast(
8773 VT,
8774 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
8775 V1, V2));
8776 }
8777 case MVT::v16f32:
8778 case MVT::v8f64:
8779 case MVT::v8i64:
8780 case MVT::v16i32:
8781 case MVT::v32i16:
8782 case MVT::v64i8: {
8783 MVT IntegerType =
8784 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8785 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8786 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8787 }
8788 default:
8789 llvm_unreachable("Not a supported integer vector type!");
8790 }
8791 }
8793 /// \brief Try to lower as a blend of elements from two inputs followed by
8794 /// a single-input permutation.
8796 /// This matches the pattern where we can blend elements from two inputs and
8797 /// then reduce the shuffle to a single-input permutation.
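// For example, the v4i32 shuffle <2, 5, 2, 7> can be lowered as the blend
// <u, 5, 2, 7> followed by the single-input permute <2, 1, 2, 3>.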
8798 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8799 SDValue V1, SDValue V2,
8800 ArrayRef<int> Mask,
8801 SelectionDAG &DAG) {
8802 // We build up the blend mask while checking whether a blend is a viable way
8803 // to reduce the shuffle.
8804 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8805 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8807 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8808 if (Mask[i] < 0)
8809 continue;
8811 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8813 if (BlendMask[Mask[i] % Size] < 0)
8814 BlendMask[Mask[i] % Size] = Mask[i];
8815 else if (BlendMask[Mask[i] % Size] != Mask[i])
8816 return SDValue(); // Can't blend in the needed input!
8818 PermuteMask[i] = Mask[i] % Size;
8821 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8822 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
8825 /// \brief Generic routine to decompose a shuffle and blend into independent
8826 /// blends and permutes.
8828 /// This matches the extremely common pattern for handling combined
8829 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
8830 /// operations. It will try to pick the best arrangement of shuffles and
8831 /// blends.
8832 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
8833 MVT VT, SDValue V1,
8834 SDValue V2,
8835 ArrayRef<int> Mask,
8836 SelectionDAG &DAG) {
8837 // Shuffle the input elements into the desired positions in V1 and V2 and
8838 // blend them together.
8839 SmallVector<int, 32> V1Mask(Mask.size(), -1);
8840 SmallVector<int, 32> V2Mask(Mask.size(), -1);
8841 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8842 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8843 if (Mask[i] >= 0 && Mask[i] < Size) {
8844 V1Mask[i] = Mask[i];
8845 BlendMask[i] = i;
8846 } else if (Mask[i] >= Size) {
8847 V2Mask[i] = Mask[i] - Size;
8848 BlendMask[i] = i + Size;
8851 // Try to lower with the simpler initial blend strategy unless one of the
8852 // input shuffles would be a no-op. We prefer to shuffle inputs as the
8853 // shuffle may be able to fold with a load or other benefit. However, when
8854 // we'll have to do 2x as many shuffles in order to achieve this, blending
8855 // first is a better strategy.
8856 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
8857 if (SDValue BlendPerm =
8858 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
8859 return BlendPerm;
8861 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8862 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8863 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8866 /// \brief Try to lower a vector shuffle as a rotation.
8868 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
8869 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
8870 ArrayRef<int> Mask) {
8871 int NumElts = Mask.size();
8873 // We need to detect various ways of spelling a rotation:
8874 // [11, 12, 13, 14, 15, 0, 1, 2]
8875 // [-1, 12, 13, 14, -1, -1, 1, -1]
8876 // [-1, -1, -1, -1, -1, -1, 1, 2]
8877 // [ 3, 4, 5, 6, 7, 8, 9, 10]
8878 // [-1, 4, 5, 6, -1, -1, 9, -1]
8879 // [-1, 4, 5, 6, -1, -1, -1, -1]
8880 int Rotation = 0;
8881 SDValue Lo, Hi;
8882 for (int i = 0; i < NumElts; ++i) {
8883 int M = Mask[i];
8884 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
8885 "Unexpected mask index.");
8887 if (M < 0)
8888 continue;
8889 // Determine where a rotated vector would have started.
8890 int StartIdx = i - (M % NumElts);
8891 if (StartIdx == 0)
8892 // The identity rotation isn't interesting, stop.
8893 return -1;
8895 // If we found the tail of a vector, the rotation must be the missing
8896 // front. If we found the head of a vector, it must be how much of the
8897 // head.
8898 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
8900 if (Rotation == 0)
8901 Rotation = CandidateRotation;
8902 else if (Rotation != CandidateRotation)
8903 // The rotations don't match, so we can't match this mask.
8904 return -1;
8906 // Compute which value this mask is pointing at.
8907 SDValue MaskV = M < NumElts ? V1 : V2;
8909 // Compute which of the two target values this index should be assigned
8910 // to. This reflects whether the high elements are remaining or the low
8911 // elements are remaining.
8912 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
8914 // Either set up this value if we've not encountered it before, or check
8915 // that it remains consistent.
8916 if (!TargetV)
8917 TargetV = MaskV;
8918 else if (TargetV != MaskV)
8919 // This may be a rotation, but it pulls from the inputs in some
8920 // unsupported interleaving.
8921 return -1;
8922 }
8924 // Check that we successfully analyzed the mask, and normalize the results.
8925 assert(Rotation != 0 && "Failed to locate a viable rotation!");
8926 assert((Lo || Hi) && "Failed to find a rotated input vector!");
8928 if (!Lo)
8929 Lo = Hi;
8930 else if (!Hi)
8931 Hi = Lo;
8933 V1 = Lo;
8934 V2 = Hi;
8936 return Rotation;
8937 }
8938 /// \brief Try to lower a vector shuffle as a byte rotation.
8940 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
8941 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
8942 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
8943 /// try to generically lower a vector shuffle through such a pattern. It
8944 /// does not check for the profitability of lowering either as PALIGNR or
8945 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
8946 /// This matches shuffle vectors that look like:
8948 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
8950 /// Essentially it concatenates V1 and V2, shifts right by some number of
8951 /// elements, and takes the low elements as the result. Note that while this is
8952 /// specified as a *right shift* because x86 is little-endian, it is a *left
8953 /// rotate* of the vector lanes.
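// For the v8i16 example above, the rotation is 3 elements, i.e. 6 bytes, so
// matchVectorShuffleAsByteRotate below would return 6 for a 128-bit vector.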
8954 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
8955 ArrayRef<int> Mask) {
8956 // Don't accept any shuffles with zero elements.
8957 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
8958 return -1;
8960 // PALIGNR works on 128-bit lanes.
8961 SmallVector<int, 16> RepeatedMask;
8962 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
8963 return -1;
8965 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
8966 if (Rotation <= 0)
8967 return -1;
8969 // PALIGNR rotates bytes, so we need to scale the
8970 // rotation based on how many bytes are in the vector lane.
8971 int NumElts = RepeatedMask.size();
8972 int Scale = 16 / NumElts;
8973 return Rotation * Scale;
8976 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
8977 SDValue V1, SDValue V2,
8978 ArrayRef<int> Mask,
8979 const X86Subtarget &Subtarget,
8980 SelectionDAG &DAG) {
8981 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
8983 SDValue Lo = V1, Hi = V2;
8984 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
8985 if (ByteRotation <= 0)
8986 return SDValue();
8988 // Cast the inputs to i8 vector of correct length to match PALIGNR or
8989 // PSLLDQ/PSRLDQ.
8990 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8991 Lo = DAG.getBitcast(ByteVT, Lo);
8992 Hi = DAG.getBitcast(ByteVT, Hi);
8994 // SSSE3 targets can use the palignr instruction.
8995 if (Subtarget.hasSSSE3()) {
8996 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
8997 "512-bit PALIGNR requires BWI instructions");
8998 return DAG.getBitcast(
8999 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9000 DAG.getConstant(ByteRotation, DL, MVT::i8)));
9003 assert(VT.is128BitVector() &&
9004 "Rotate-based lowering only supports 128-bit lowering!");
9005 assert(Mask.size() <= 16 &&
9006 "Can shuffle at most 16 bytes in a 128-bit vector!");
9007 assert(ByteVT == MVT::v16i8 &&
9008 "SSE2 rotate lowering only needed for v16i8!");
9010 // Default SSE2 implementation
9011 int LoByteShift = 16 - ByteRotation;
9012 int HiByteShift = ByteRotation;
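// e.g. for ByteRotation == 5: Lo is shifted left by 11 bytes, Hi is shifted
// right by 5 bytes, and the OR below stitches the two pieces back together.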
9014 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9015 DAG.getConstant(LoByteShift, DL, MVT::i8));
9016 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9017 DAG.getConstant(HiByteShift, DL, MVT::i8));
9018 return DAG.getBitcast(VT,
9019 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9022 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9024 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
9025 /// rotation of the concatenation of two vectors; this routine will
9026 /// try to generically lower a vector shuffle through such a pattern.
9028 /// Essentially it concatenates V1 and V2, shifts right by some number of
9029 /// elements, and takes the low elements as the result. Note that while this is
9030 /// specified as a *right shift* because x86 is little-endian, it is a *left
9031 /// rotate* of the vector lanes.
9032 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9033 SDValue V1, SDValue V2,
9034 ArrayRef<int> Mask,
9035 const X86Subtarget &Subtarget,
9036 SelectionDAG &DAG) {
9037 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9038 "Only 32-bit and 64-bit elements are supported!");
9040 // 128/256-bit vectors are only supported with VLX.
9041 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9042 && "VLX required for 128/256-bit vectors");
9044 SDValue Lo = V1, Hi = V2;
9045 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9046 if (Rotation <= 0)
9047 return SDValue();
9049 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9050 DAG.getConstant(Rotation, DL, MVT::i8));
9053 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9055 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9056 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9057 /// matches elements from one of the input vectors shuffled to the left or
9058 /// right with zeroable elements 'shifted in'. It handles both the strictly
9059 /// bit-wise element shifts and the byte shift across an entire 128-bit double
9060 /// quad word lane.
9062 /// PSHL : (little-endian) left bit shift.
9063 /// [ zz, 0, zz, 2 ]
9064 /// [ -1, 4, zz, -1 ]
9065 /// PSRL : (little-endian) right bit shift.
9066 /// [ 1, zz, 3, zz]
9067 /// [ -1, -1, 7, zz]
9068 /// PSLLDQ : (little-endian) left byte shift
9069 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9070 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9071 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9072 /// PSRLDQ : (little-endian) right byte shift
9073 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9074 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9075 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
9076 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9077 unsigned ScalarSizeInBits,
9078 ArrayRef<int> Mask, int MaskOffset,
9079 const APInt &Zeroable,
9080 const X86Subtarget &Subtarget) {
9081 int Size = Mask.size();
9082 unsigned SizeInBits = Size * ScalarSizeInBits;
9084 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9085 for (int i = 0; i < Size; i += Scale)
9086 for (int j = 0; j < Shift; ++j)
9087 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
9088 return false;
9090 return true;
9091 };
9093 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9094 for (int i = 0; i != Size; i += Scale) {
9095 unsigned Pos = Left ? i + Shift : i;
9096 unsigned Low = Left ? i : i + Shift;
9097 unsigned Len = Scale - Shift;
9098 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
9099 return -1;
9100 }
9102 int ShiftEltBits = ScalarSizeInBits * Scale;
9103 bool ByteShift = ShiftEltBits > 64;
9104 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9105 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9106 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9108 // Normalize the scale for byte shifts to still produce an i64 element
9109 // type.
9110 Scale = ByteShift ? Scale / 2 : Scale;
9112 // We need to round trip through the appropriate type for the shift.
9113 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9114 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9115 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9116 return (int)ShiftAmt;
9117 };
9119 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9120 // keep doubling the size of the integer elements up to that. We can
9121 // then shift the elements of the integer vector by whole multiples of
9122 // their width within the elements of the larger integer vector. Test each
9123 // multiple to see if we can find a match with the moved element indices
9124 // and that the shifted in elements are all zeroable.
9125 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9126 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9127 for (int Shift = 1; Shift != Scale; ++Shift)
9128 for (bool Left : {true, false})
9129 if (CheckZeros(Shift, Scale, Left)) {
9130 int ShiftAmt = MatchShift(Shift, Scale, Left);
9131 if (0 < ShiftAmt)
9132 return ShiftAmt;
9133 }
9135 // no match
9136 return -1;
9137 }
9139 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9140 SDValue V2, ArrayRef<int> Mask,
9141 const APInt &Zeroable,
9142 const X86Subtarget &Subtarget,
9143 SelectionDAG &DAG) {
9144 int Size = Mask.size();
9145 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9147 MVT ShiftVT;
9148 unsigned Opcode;
9149 SDValue V = V1;
9151 // Try to match shuffle against V1 shift.
9152 int ShiftAmt = matchVectorShuffleAsShift(
9153 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9155 // If V1 failed, try to match shuffle against V2 shift.
9156 if (ShiftAmt < 0) {
9157 ShiftAmt =
9158 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9159 Mask, Size, Zeroable, Subtarget);
9160 V = V2;
9161 }
9163 if (ShiftAmt < 0)
9164 return SDValue();
9166 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9167 "Illegal integer vector type");
9168 V = DAG.getBitcast(ShiftVT, V);
9169 V = DAG.getNode(Opcode, DL, ShiftVT, V,
9170 DAG.getConstant(ShiftAmt, DL, MVT::i8));
9171 return DAG.getBitcast(VT, V);
9174 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9175 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9176 SDValue V2, ArrayRef<int> Mask,
9177 const APInt &Zeroable,
9178 SelectionDAG &DAG) {
9179 int Size = Mask.size();
9180 int HalfSize = Size / 2;
9181 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9182 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9184 // Upper half must be undefined.
9185 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9186 return SDValue();
9188 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9189 // Remainder of lower half result is zero and upper half is all undef.
9190 auto LowerAsEXTRQ = [&]() {
9191 // Determine the extraction length from the part of the
9192 // lower half that isn't zeroable.
9193 int Len = HalfSize;
9194 for (; Len > 0; --Len)
9195 if (!Zeroable[Len - 1])
9196 break;
9197 assert(Len > 0 && "Zeroable shuffle mask");
9199 // Attempt to match first Len sequential elements from the lower half.
9200 SDValue Src;
9201 int Idx = -1;
9202 for (int i = 0; i != Len; ++i) {
9203 int M = Mask[i];
9204 if (M == SM_SentinelUndef)
9205 continue;
9206 SDValue &V = (M < Size ? V1 : V2);
9207 M = M % Size;
9209 // The extracted elements must start at a valid index and all mask
9210 // elements must be in the lower half.
9211 if (i > M || M >= HalfSize)
9212 return SDValue();
9214 if (Idx < 0 || (Src == V && Idx == (M - i))) {
9215 Src = V;
9216 Idx = M - i;
9217 continue;
9218 }
9219 return SDValue();
9220 }
9225 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9226 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9227 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9228 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
9229 DAG.getConstant(BitLen, DL, MVT::i8),
9230 DAG.getConstant(BitIdx, DL, MVT::i8));
9231 };
9233 if (SDValue ExtrQ = LowerAsEXTRQ())
9234 return ExtrQ;
9236 // INSERTQ: Extract lowest Len elements from lower half of second source and
9237 // insert over first source, starting at Idx.
9238 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9239 auto LowerAsInsertQ = [&]() {
9240 for (int Idx = 0; Idx != HalfSize; ++Idx) {
9241 SDValue Base;
9243 // Attempt to match first source from mask before insertion point.
9244 if (isUndefInRange(Mask, 0, Idx)) {
9246 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9247 Base = V1;
9248 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9249 Base = V2;
9250 } else {
9251 continue;
9252 }
9254 // Extend the extraction length looking to match both the insertion of
9255 // the second source and the remaining elements of the first.
9256 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9257 SDValue Insert;
9258 int Len = Hi - Idx;
9261 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9262 Insert = V1;
9263 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9264 Insert = V2;
9265 } else {
9266 continue;
9267 }
9269 // Match the remaining elements of the lower half.
9270 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9272 } else if ((!Base || (Base == V1)) &&
9273 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9274 Base = V1;
9275 } else if ((!Base || (Base == V2)) &&
9276 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9277 Size + Hi)) {
9278 Base = V2;
9279 } else {
9280 continue;
9281 }
9283 // We may not have a base (first source) - this can safely be undefined.
9284 if (!Base)
9285 Base = DAG.getUNDEF(VT);
9287 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9288 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9289 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
9290 DAG.getConstant(BitLen, DL, MVT::i8),
9291 DAG.getConstant(BitIdx, DL, MVT::i8));
9292 }
9293 }
9295 return SDValue();
9296 };
9298 if (SDValue InsertQ = LowerAsInsertQ())
9299 return InsertQ;
9301 return SDValue();
9302 }
9304 /// \brief Lower a vector shuffle as a zero or any extension.
9306 /// Given a specific number of elements, element bit width, and extension
9307 /// stride, produce either a zero or any extension based on the available
9308 /// features of the subtarget. The extended elements are consecutive and
9309 /// can start from an offset element index in the input; to
9310 /// avoid excess shuffling, the offset must either be in the bottom lane
9311 /// or at the start of a higher lane. All extended elements must be from
9312 /// the same input vector.
9313 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9314 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9315 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9316 assert(Scale > 1 && "Need a scale to extend.");
9317 int EltBits = VT.getScalarSizeInBits();
9318 int NumElements = VT.getVectorNumElements();
9319 int NumEltsPerLane = 128 / EltBits;
9320 int OffsetLane = Offset / NumEltsPerLane;
9321 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9322 "Only 8, 16, and 32 bit elements can be extended.");
9323 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9324 assert(0 <= Offset && "Extension offset must be positive.");
9325 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9326 "Extension offset must be in the first lane or start an upper lane.");
9328 // Check that an index is in same lane as the base offset.
9329 auto SafeOffset = [&](int Idx) {
9330 return OffsetLane == (Idx / NumEltsPerLane);
9333 // Shift along an input so that the offset base moves to the first element.
9334 auto ShuffleOffset = [&](SDValue V) {
9335 if (!Offset)
9336 return V;
9338 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9339 for (int i = 0; i * Scale < NumElements; ++i) {
9340 int SrcIdx = i + Offset;
9341 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9342 }
9343 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9344 };
9346 // Found a valid zext mask! Try various lowering strategies based on the
9347 // input type and available ISA extensions.
9348 if (Subtarget.hasSSE41()) {
9349 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9350 // PUNPCK will catch this in a later shuffle match.
9351 if (Offset && Scale == 2 && VT.is128BitVector())
9352 return SDValue();
9353 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9354 NumElements / Scale);
9355 InputV = ShuffleOffset(InputV);
9356 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9357 return DAG.getBitcast(VT, InputV);
9360 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9362 // For any extends we can cheat for larger element sizes and use shuffle
9363 // instructions that can fold with a load and/or copy.
9364 if (AnyExt && EltBits == 32) {
9365 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9366 -1};
9367 return DAG.getBitcast(
9368 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9369 DAG.getBitcast(MVT::v4i32, InputV),
9370 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9372 if (AnyExt && EltBits == 16 && Scale > 2) {
9373 int PSHUFDMask[4] = {Offset / 2, -1,
9374 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9375 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9376 DAG.getBitcast(MVT::v4i32, InputV),
9377 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9378 int PSHUFWMask[4] = {1, -1, -1, -1};
9379 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9380 return DAG.getBitcast(
9381 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9382 DAG.getBitcast(MVT::v8i16, InputV),
9383 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9386 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9387 // to 64-bits.
9388 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9389 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9390 assert(VT.is128BitVector() && "Unexpected vector width!");
9392 int LoIdx = Offset * EltBits;
9393 SDValue Lo = DAG.getBitcast(
9394 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9395 DAG.getConstant(EltBits, DL, MVT::i8),
9396 DAG.getConstant(LoIdx, DL, MVT::i8)));
9398 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9399 !SafeOffset(Offset + 1))
9400 return DAG.getBitcast(VT, Lo);
9402 int HiIdx = (Offset + 1) * EltBits;
9403 SDValue Hi = DAG.getBitcast(
9404 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9405 DAG.getConstant(EltBits, DL, MVT::i8),
9406 DAG.getConstant(HiIdx, DL, MVT::i8)));
9407 return DAG.getBitcast(VT,
9408 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9411 // If this would require more than 2 unpack instructions to expand, use
9412 // pshufb when available. We can only use more than 2 unpack instructions
9413 // when zero extending i8 elements which also makes it easier to use pshufb.
9414 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9415 assert(NumElements == 16 && "Unexpected byte vector width!");
9416 SDValue PSHUFBMask[16];
9417 for (int i = 0; i < 16; ++i) {
9418 int Idx = Offset + (i / Scale);
9419 PSHUFBMask[i] = DAG.getConstant(
9420 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9422 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9423 return DAG.getBitcast(
9424 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9425 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
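  // For instance, zero extending i8 to i64 (Scale == 8) from offset 0 builds
  // the control vector <0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  // 1, 0x80, ..., 0x80>; any PSHUFB control byte with its high bit set
  // writes a zero into the corresponding destination byte.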
  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
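  // Each round interleaves the input with zeros (or undef for any-extends),
  // doubling the element size; e.g. an i8 -> i32 zero extension without
  // SSE4.1 is PUNPCKLBW with a zero vector followed by PUNPCKLWD with a zero
  // vector.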
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
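///
/// For example, the v4i32 shuffle <0, 4, 1, 4> where V2 is all-zeros is
/// really a zero extension of the low two i32 lanes of V1 to i64, and is
/// matched here at Scale == 2.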
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
  };
  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}
/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern for which we have especially efficient lowerings
/// across all subtarget feature sets.
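///
/// For example, a v4f32 mask <4, 1, 2, 3> where lanes 1-3 of V1 are zeroable
/// is really just "place V2's low element into a zero vector", which the
/// VZEXT_MOVL node (a move of the low element that zeroes the rest)
/// implements directly.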
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    // This is essentially a special case blend operation, but if we have
    // general purpose blend operations, they are always faster. Bail and let
    // the rest of the lowering handle these as blends.
    if (Subtarget.hasSSE41())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
                              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
/// Try to lower broadcast of a single truncated integer element, coming from
/// a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
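///
/// For example, to broadcast bits [63:32] of an i64 scalar_to_vector element
/// as i32 lanes: shift the scalar right by 32, truncate to i32, and
/// VBROADCAST the result, which folds better than shuffling bytes around.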
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL,
                                         Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
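///
/// For example, a load-fed v4f32 splat <0, 0, 0, 0> becomes a single
/// VBROADCASTSS on AVX, and a v2f64 splat <0, 0> becomes MOVDDUP on SSE3,
/// either of which can fold the load of the broadcast scalar.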
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumElts = Mask.size();
  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      SDValue VSrc = V.getOperand(0);
      MVT SrcVT = VSrc.getSimpleValueType();
      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
        break;
      V = VSrc;
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
    }

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));

    // Make sure the newly-created LOAD is in the same position as Ld in
    // terms of dependency. We create a TokenFactor for Ld and V,
    // and update uses of Ld's output chain to use the TokenFactor.
    if (Ld->hasAnyUseOfValue(1)) {
      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                     SDValue(Ld, 1), SDValue(V.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
                             SDValue(V.getNode(), 1));
    }
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
    MVT SrcVT = V.getSimpleValueType();
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
           "Unexpected vector size");

    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
                    DAG.getIntPtrConstant(BroadcastIdx, DL));
  }

  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                    DAG.getBitcast(MVT::f64, V));

  // Bitcast back to the same scalar type as BroadcastVT.
  MVT SrcVT = V.getSimpleValueType();
  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
    } else {
      SrcVT = BroadcastVT.getScalarType();
    }
    V = DAG.getBitcast(SrcVT, V);
  }

  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
    V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (SrcVT.getSizeInBits() > 128)
    V = extract128BitVector(V, 0, DAG, DL);

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
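// The INSERTPS immediate is laid out as [7:6] = source element index in V2,
// [5:4] = destination element index, [3:0] = zero mask; e.g. 0x1C inserts
// V2[0] into lane 1 and zeroes lanes 2 and 3.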
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
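///
/// For example, the v8i16 mask <0, 8, 2, 10, 4, 12, 6, 14> isn't an unpack,
/// but shuffling each input's even elements down into its low half first
/// lets a single PUNPCKLWD produce the result.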
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(VT.is128BitVector() &&
         "This routine only works on 128-bit vectors.");
  assert(!V2.isUndef() &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  int Size = Mask.size();

  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT =
        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
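///
/// For example, the single-input mask <1, 0> swaps the two doubles and
/// lowers to SHUFPD (or VPERMILPD with AVX) with immediate 0b01.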
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPD which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
  assert(Mask[1] >= 2 && "Non-canonicalized blend!");

  // If we have a single input, insert that into V1 if we can do so cheaply.
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
    // Try inverting the insertion since for v2 masks it is easy to do and we
    // can't reliably sort the mask one way or the other.
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
      return Insertion;
  }

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
          DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  // If we have a blend of two same-type PACKUS operations and the blend aligns
  // with the low and high halves, we can just merge the PACKUS operations.
  // This is particularly important as it lets us merge shuffles that this
  // routine itself creates.
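  // For example, if V1 = PACKUS(a, b) and V2 = PACKUS(c, d), the blend
  // <0, 3> can be emitted directly as PACKUS(a, d).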
  auto GetPackNode = [](SDValue V) {
    V = peekThroughBitcasts(V);
    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
  };
  if (SDValue V1Pack = GetPackNode(V1))
    if (SDValue V2Pack = GetPackNode(V2)) {
      EVT PackVT = V1Pack.getValueType();
      if (PackVT == V2Pack.getValueType())
        return DAG.getBitcast(MVT::v2i64,
                              DAG.getNode(X86ISD::PACKUS, DL, PackVT,
                                          Mask[0] == 0 ? V1Pack.getOperand(0)
                                                       : V1Pack.getOperand(1),
                                          Mask[1] == 2 ? V2Pack.getOperand(0)
                                                       : V2Pack.getOperand(1)));
    }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
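///
/// SHUFPS writes its low two result elements from the first operand and its
/// high two from the second, so e.g. <0, 2, 5, 7> qualifies while
/// <0, 4, 1, 5> does not.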
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using
/// SHUFPS. It makes no assumptions about whether this is the *best* lowering,
/// it simply uses it.
static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the
      // final shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // the shuffle.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// \brief Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // There are special ways we can lower some single-element blends. However, we
  // have custom ways we can lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V =
            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }

  // Use low/high mov instructions.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
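  // For example, <0, Z, 1, Z> with zeroable odd lanes is a v4i32 -> v2i64
  // zero extension and becomes a single PMOVZXDQ on SSE4.1.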
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // If we have direct support for blends, we should lower by decomposing
    // into a permute. That will be faster than the domain cross.
    if (IsBlendSupported)
      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                        Mask, DAG);

    // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
            DL, MVT::v4i32, V1, V2, Mask, DAG))
      return Unpack;
  }

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  SmallVector<int, 4> LoInputs;
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

  // If we are splatting two values from one half - one to each half, then
  // we can shuffle that half so each is splatted to a dword, then splat those
  // to their respective halves.
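  // For example, <1, 1, 1, 1, 3, 3, 3, 3> becomes PSHUFLW <1, 1, 3, 3>
  // followed by PSHUFD <0, 0, 1, 1>.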
10739 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10741 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10742 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10743 V = DAG.getNode(ShufWOp, DL, VT, V,
10744 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10745 V = DAG.getBitcast(PSHUFDVT, V);
10746 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10747 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10748 return DAG.getBitcast(VT, V);
10751 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10752 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10753 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10754 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10756 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10757 // such inputs we can swap two of the dwords across the half mark and end up
10758 // with <=2 inputs to each half in each half. Once there, we can fall through
10759 // to the generic code below. For example:
10761 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10762 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10764 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10765 // and an existing 2-into-2 on the other half. In this case we may have to
10766 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10767 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10768 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10769 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10770 // half than the one we target for fixing) will be fixed when we re-enter this
10771 // path. We will also combine away any sequence of PSHUFD instructions that
10772 // result into a single instruction. Here is an example of the tricky case:
10774 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10775 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10777 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10779 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10780 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10782 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10783 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10785 // The result is fine to be handled by the generic logic.
10786 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10787 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10788 int AOffset, int BOffset) {
10789 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10790 "Must call this with A having 3 or 1 inputs from the A half.");
10791 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10792 "Must call this with B having 1 or 3 inputs from the B half.");
10793 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10794 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10796 bool ThreeAInputs = AToAInputs.size() == 3;
10798 // Compute the index of dword with only one word among the three inputs in
10799 // a half by taking the sum of the half with three inputs and subtracting
10800 // the sum of the actual three inputs. The difference is the remaining
10802 int ADWord, BDWord;
10803 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10804 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10805 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10806 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10807 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10808 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10809 int TripleNonInputIdx =
10810 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10811 TripleDWord = TripleNonInputIdx / 2;
10813 // We use xor with one to compute the adjacent DWord to whichever one the
10815 OneInputDWord = (OneInput / 2) ^ 1;
10817 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10818 // and BToA inputs. If there is also such a problem with the BToB and AToB
10819 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10820 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10821 // is essential that we don't *create* a 3<-1 as then we might oscillate.
10822 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10823 // Compute how many inputs will be flipped by swapping these DWords. We
10825 // to balance this to ensure we don't form a 3-1 shuffle in the other
10827 int NumFlippedAToBInputs =
10828 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10829 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10830 int NumFlippedBToBInputs =
10831 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10832 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10833 if ((NumFlippedAToBInputs == 1 &&
10834 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10835 (NumFlippedBToBInputs == 1 &&
10836 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10837 // We choose whether to fix the A half or B half based on whether that
10838 // half has zero flipped inputs. At zero, we may not be able to fix it
10839 // with that half. We also bias towards fixing the B half because that
10840 // will more commonly be the high half, and we have to bias one way.
10841 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10842 ArrayRef<int> Inputs) {
10843 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10844 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10845 // Determine whether the free index is in the flipped dword or the
10846 // unflipped dword based on where the pinned index is. We use this bit
10847 // in an xor to conditionally select the adjacent dword.
10848 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
                          VT, V,
                          getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
              M = FixFreeIdx;
            else if (M >= 0 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }

    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M >= 0 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;
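    // Illustrative example: if ADWord == 1 and BDWord == 3, the PSHUFD mask
    // becomes {0, 3, 2, 1}, and mask values naming words 2-3 are renumbered to
    // words 6-7 (and vice versa) to follow the swapped dwords.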

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
                                                     DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
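  // Illustrative example for the helper above: with InPlaceInputs == {0, 3} in
  // the low half, word 0 stays put and word 3 is pulled down into the adjacent
  // slot 1 (0 ^ 1), packing the pair into dword 0.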
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;

    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else {
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          }
          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }
      return;
    }

    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is
          // no free slot adjacent to one of the inputs. In this case, we have
          // to swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }

    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

  // Now enact all the shuffles we've computed to move the inputs into their
  // target halves.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}

/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
  SDValue V1Mask[16];
  SDValue V2Mask[16];
  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
  int Scale = 16 / Size;
  for (int i = 0; i < 16; ++i) {
    if (Mask[i / Scale] < 0) {
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
    } else {
      const int ZeroMask = 0x80;
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
                                         : ZeroMask;
      int V2Idx = Mask[i / Scale] < Size
                      ? ZeroMask
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
      if (Zeroable[i / Scale])
        V1Idx = V2Idx = ZeroMask;
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      V1InUse |= (ZeroMask != V1Idx);
      V2InUse |= (ZeroMask != V2Idx);
    }
  }

  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V1),
                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V2),
                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}
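
// Illustrative example (not in the original comments): for a v8i16 shuffle
// (Size == 8, Scale == 2) with Mask[0] == 9, bytes 0 and 1 of the V2 PSHUFB
// control are (9 - 8) * 2 + 0 == 2 and 3, while the matching V1 control bytes
// are 0x80 so PSHUFB zeroes them ahead of the final OR blend.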

/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use shift instructions.
    if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
                                                        Mask, Subtarget, DAG))
      return Rotate;

    // Make a copy of the mask so it can be modified.
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
                                                     MutableMask, Subtarget,
                                                     DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
                                                            V2, Mask, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, DAG, V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to so the fallback strategy is to
  // decompose into single-input permutes and blends.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                    Mask, DAG);
}

/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern
    // we want.
    if (Mask[i] < 0)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}
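
// Worked check (illustrative): for a single-input v16i8 mask the modulus is 16
// and ModMask == 15, so at i == 9 with N == 1 the required value is
// (9 << 1) & 15 == 2, matching the wrapped pattern 0, 2, ..., 14, 0, 2, ...
// shown above.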

/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;
      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple i16
            // shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3()) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // prefer this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend
      // as an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // the unpack.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  bool IsSingleInput = V2.isUndef();
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
    //
    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }
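
  // Illustrative example (not in the original comments): with NumEvenDrops ==
  // 2 the AND above keeps only the low byte of each i32 (mask type v4i32), and
  // the two PACKUS rounds then compact those surviving bytes into the low
  // portion of the result.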

  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}

/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}

/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    V = peekThroughBitcasts(V);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getVectorElementType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0, DL));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
    }
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend
      // mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend
      // mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
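
// Illustrative example (not in the original comments): splitting a v4i64
// shuffle with mask <0, 4, 1, 5> yields LoMask <0, 4> and HiMask <1, 5>, each
// of which lowers as a v2i64 shuffle of the extracted half-width inputs.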

/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
                                                SDValue V1, SDValue V2,
                                                ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
                          "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This
  // requires that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}

/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
  if (!LaneCrossing[0] || !LaneCrossing[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  SmallVector<int, 32> FlippedBlendMask(Size);
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);

  // Flip the vector, and blend the results which should now be in-lane. The
  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
  // 5 for the high source. The value 3 selects the high half of source 2 and
  // the value 2 selects the low half of source 2. We only use source 2 to
  // allow folding it into a memory operand.
  unsigned PERMMask = 3 | 2 << 4;
  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
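
// Worked example (illustrative): for a single-input v4f64 mask <2, 3, 0, 1>
// every element crosses lanes, so Flipped swaps the two 128-bit halves of V1
// and FlippedBlendMask becomes <4, 5, 6, 7>, i.e. the result is exactly the
// flipped vector.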

/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsV1Zero && !IsV2Zero) {
    // Check for patterns which can be matched with a single insert of a
    // 128-bit subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
      // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
      if (Subtarget.hasAVX2() && V2.isUndef())
        return SDValue();

      MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                   VT.getVectorNumElements() / 2);
      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                DAG.getIntPtrConstant(0, DL));
      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                OnlyUsesV1 ? V1 : V2,
                                DAG.getIntPtrConstant(0, DL));
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.
  //
  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination

  int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
  int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];

  unsigned PermMask = MaskLO | (MaskHI << 4);

  // If either input is a zero vector, replace it with an undef input.
  // Shuffle mask values < 4 are selecting elements of V1.
  // Shuffle mask values >= 4 are selecting elements of V2.
  // Adjust each half of the permute mask by clearing the half that was
  // selecting the zero vector and setting the zero mask bit.
  if (IsV1Zero) {
    V1 = DAG.getUNDEF(VT);
    if (MaskLO < 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) {
    V2 = DAG.getUNDEF(VT);
    if (MaskLO >= 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}
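
// Worked example (illustrative): WidenedMask <1, 2> gives PermMask
// 1 | (2 << 4) == 0x21, selecting the high 128 bits of V1 for the low half of
// the destination and the low 128 bits of V2 for the high half.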

/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask
/// in each 128-bit lane. This handles many cases where we can quickly blend
/// away the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
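
// Worked example (illustrative): a v8i32 mask <4, 5, 6, 7, 8, 9, 10, 11> takes
// lane 1 of V1 and then lane 0 of V2, so the v4i64 lane-fixing mask is
// <2, 3, 4, 5> and the residual in-lane mask is the identity.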

/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
                                               SDValue V1, SDValue V2,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert(VT.is256BitVector() && "Expected 256-bit vector");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
  if (!UndefLower && !UndefUpper)
    return SDValue();

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (UndefUpper &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(HalfNumElts, DL));
  }

  // If the shuffle only uses two of the four halves of the input operands,
  // then extract them and perform the 'half' shuffle at half width.
  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
  int HalfIdx1 = -1, HalfIdx2 = -1;
  SmallVector<int, 8> HalfMask(HalfNumElts);
  unsigned Offset = UndefLower ? HalfNumElts : 0;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
    return SDValue();
  }
  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  // Only shuffle the halves of the inputs when useful.
  int NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  int NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);

  // uuuuXXXX - don't extract uppers just to insert again.
  if (UndefLower && NumUpperHalves != 0)
    return SDValue();

  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
  if (UndefUpper && NumUpperHalves == 2)
    return SDValue();

  // AVX2 - XXXXuuuu - always extract lowers.
  if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();
    // AVX2 supports variable 32-bit element cross-lane shuffles.
    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
      // XXXXuuuu - don't extract lowers and uppers.
      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
        return SDValue();
    }
  }

  auto GetHalfVector = [&](int HalfIdx) {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
                       DAG.getIntPtrConstant(HalfIdx, DL));
  };

  SDValue Half1 = GetHalfVector(HalfIdx1);
  SDValue Half2 = GetHalfVector(HalfIdx2);
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
                     DAG.getIntPtrConstant(Offset, DL));
}
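
// Illustrative example (not in the original comments): a v8f32 shuffle
// <u, u, u, u, 0, 1, 2, 3> hits the early exit above that inserts the lower
// 128-bit subvector of V1 into the upper half of an otherwise undef result.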

/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in
/// the slot required by the given mask and require no permutation.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}

/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;

  // On AVX2 we may be able to just shuffle the lowest elements and then
  // broadcast the result.
  if (Subtarget.hasAVX2()) {
    for (unsigned BroadcastSize : {16, 32, 64}) {
      if (BroadcastSize <= VT.getScalarSizeInBits())
        continue;
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only references the lowest 128-bit
      // lane of the inputs.
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
          for (int j = 0; j != NumBroadcastElts; ++j) {
            int M = Mask[i + j];
            if (M < 0)
              continue;
            int &R = RepeatMask[j];
            if (0 != ((M % NumElts) / NumLaneElts))
              return false;
            if (0 <= R && R != M)
              return false;
            R = M;
          }
        return true;
      };

      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
      if (!FindRepeatingBroadcastMask(RepeatMask))
        continue;

      // Shuffle the (lowest) repeated elements in place for broadcast.
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

      // Shuffle the actual broadcast.
      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
        for (int j = 0; j != NumBroadcastElts; ++j)
          BroadcastMask[i + j] = j;
      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                  BroadcastMask);
    }
  }
12298 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12299 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12302 // Bail if we already have a repeated lane shuffle mask.
12303 SmallVector<int, 8> RepeatedShuffleMask;
12304 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12307 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12308 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12309 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12310 int NumSubLanes = NumLanes * SubLaneScale;
12311 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12313 // Check that all the sources are coming from the same lane and see if we can
12314 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12315 // determine the source sub-lane for each destination sub-lane.
12316 int TopSrcSubLane = -1;
12317 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12318 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12319 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12320 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12322 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12323 // Extract the sub-lane mask, check that it all comes from the same lane
12324 // and normalize the mask entries to come from the first lane.
12325 int SrcLane = -1;
12326 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12327 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12328 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12329 if (M < 0)
12330 continue;
12331 int Lane = (M % NumElts) / NumLaneElts;
12332 if ((0 <= SrcLane) && (SrcLane != Lane))
12333 return SDValue();
12334 SrcLane = Lane;
12335 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12336 SubLaneMask[Elt] = LocalM;
12337 }
12339 // Whole sub-lane is UNDEF.
12340 if (SrcLane < 0)
12341 continue;
12343 // Attempt to match against the candidate repeated sub-lane masks.
12344 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12345 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12346 for (int i = 0; i != NumSubLaneElts; ++i) {
12347 if (M1[i] < 0 || M2[i] < 0)
12348 continue;
12349 if (M1[i] != M2[i])
12350 return false;
12351 }
12352 return true;
12353 };
12355 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12356 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12357 continue;
12359 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12360 for (int i = 0; i != NumSubLaneElts; ++i) {
12361 int M = SubLaneMask[i];
12362 if (M < 0)
12363 continue;
12364 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12365 "Unexpected mask element");
12366 RepeatedSubLaneMask[i] = M;
12367 }
12369 // Track the top most source sub-lane - by setting the remaining to UNDEF
12370 // we can greatly simplify shuffle matching.
12371 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12372 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12373 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12374 break;
12375 }
12377 // Bail if we failed to find a matching repeated sub-lane mask.
12378 if (Dst2SrcSubLanes[DstSubLane] < 0)
12379 return SDValue();
12380 }
12381 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12382 "Unexpected source lane");
12384 // Create a repeating shuffle mask for the entire vector.
12385 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12386 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12387 int Lane = SubLane / SubLaneScale;
12388 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12389 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12390 int M = RepeatedSubLaneMask[Elt];
12391 if (M < 0)
12392 continue;
12393 int Idx = (SubLane * NumSubLaneElts) + Elt;
12394 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12395 }
12396 }
12397 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12399 // Shuffle each source sub-lane to its destination.
12400 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12401 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12402 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12403 if (SrcSubLane < 0)
12404 continue;
12405 for (int j = 0; j != NumSubLaneElts; ++j)
12406 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12407 }
12409 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12410 SubLaneMask);
12411 }
12413 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12414 unsigned &ShuffleImm,
12415 ArrayRef<int> Mask) {
12416 int NumElts = VT.getVectorNumElements();
12417 assert(VT.getScalarSizeInBits() == 64 &&
12418 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12419 "Unexpected data type for VSHUFPD");
12421 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12422 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
12424 bool ShufpdMask = true;
12425 bool CommutableMask = true;
12426 for (int i = 0; i < NumElts; ++i) {
12427 if (Mask[i] == SM_SentinelUndef)
12428 continue;
12429 if (Mask[i] < 0)
12430 return false;
12431 int Val = (i & 6) + NumElts * (i & 1);
12432 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12433 if (Mask[i] < Val || Mask[i] > Val + 1)
12434 ShufpdMask = false;
12435 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12436 CommutableMask = false;
12437 ShuffleImm |= (Mask[i] % 2) << i;
12438 }
12440 if (ShufpdMask)
12441 return true;
12442 if (CommutableMask) {
12443 std::swap(V1, V2);
12444 return true;
12445 }
12447 return false;
12448 }
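// Worked example (annotation, not in the original source): for v4f64 and
// Mask = <0, 5, 2, 7>, every element i stays within its 128-bit pair and
// the odd elements come from V2, so ShufpdMask holds and the immediate
// collects Mask[i] % 2 per bit, giving 0b1010.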
12450 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12451 ArrayRef<int> Mask, SDValue V1,
12452 SDValue V2, SelectionDAG &DAG) {
12453 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12454 "Unexpected data type for VSHUFPD");
12456 unsigned Immediate = 0;
12457 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12458 return SDValue();
12460 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12461 DAG.getConstant(Immediate, DL, MVT::i8));
12462 }
12464 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12465 ArrayRef<int> Mask, SDValue V1,
12466 SDValue V2, SelectionDAG &DAG) {
12467 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12468 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12470 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12471 if (V2.isUndef())
12472 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12474 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12475 }
12477 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12478 ///
12479 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12480 /// isn't available.
12481 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12482 const APInt &Zeroable,
12483 SDValue V1, SDValue V2,
12484 const X86Subtarget &Subtarget,
12485 SelectionDAG &DAG) {
12486 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12487 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12488 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12490 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12491 Zeroable, Subtarget, DAG))
12492 return V;
12494 if (V2.isUndef()) {
12495 // Check for being able to broadcast a single element.
12496 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12497 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12498 return Broadcast;
12500 // Use low duplicate instructions for masks that match their pattern.
12501 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12502 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12504 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12505 // Non-half-crossing single input shuffles can be lowered with an
12506 // interleaved permutation.
12507 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12508 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12509 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12510 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12511 }
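// Illustrative note (annotation, not in the original source): for
// Mask = <1, 0, 3, 2> the immediate above evaluates to 0b0101, i.e. each
// 128-bit lane swaps its two doubles in place.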
12513 // With AVX2 we have direct support for this permutation.
12514 if (Subtarget.hasAVX2())
12515 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12516 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12518 // Try to create an in-lane repeating shuffle mask and then shuffle the
12519 // results into the target lanes.
12520 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12521 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12522 return V;
12524 // Otherwise, fall back.
12525 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12526 DAG);
12527 }
12529 // Use dedicated unpack instructions for masks that match their pattern.
12530 if (SDValue V =
12531 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12532 return V;
12534 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12535 Zeroable, Subtarget, DAG))
12536 return Blend;
12538 // Check if the blend happens to exactly fit that of SHUFPD.
12539 if (SDValue Op =
12540 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12541 return Op;
12543 // Try to create an in-lane repeating shuffle mask and then shuffle the
12544 // results into the target lanes.
12545 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12546 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12547 return V;
12549 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12550 // shuffle. However, if we have AVX2 and either inputs are already in place,
12551 // we will be able to shuffle even across lanes the other input in a single
12552 // instruction so skip this pattern.
12553 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12554 isShuffleMaskInputInPlace(1, Mask))))
12555 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12556 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12557 return Result;
12558 // If we have VLX support, we can use VEXPAND.
12559 if (Subtarget.hasVLX())
12560 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12561 V1, V2, DAG, Subtarget))
12562 return V;
12564 // If we have AVX2 then we always want to lower with a blend because at v4 we
12565 // can fully permute the elements.
12566 if (Subtarget.hasAVX2())
12567 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12568 Mask, DAG);
12570 // Otherwise fall back on generic lowering.
12571 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12572 }
12574 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12575 ///
12576 /// This routine is only called when we have AVX2 and thus a reasonable
12577 /// instruction set for v4i64 shuffling.
12578 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12579 const APInt &Zeroable,
12580 SDValue V1, SDValue V2,
12581 const X86Subtarget &Subtarget,
12582 SelectionDAG &DAG) {
12583 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12584 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12585 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12586 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12588 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12589 Zeroable, Subtarget, DAG))
12590 return V;
12592 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12593 Zeroable, Subtarget, DAG))
12594 return Blend;
12596 // Check for being able to broadcast a single element.
12597 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12598 Mask, Subtarget, DAG))
12599 return Broadcast;
12601 if (V2.isUndef()) {
12602 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12603 // can use lower latency instructions that will operate on both lanes.
12604 SmallVector<int, 2> RepeatedMask;
12605 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12606 SmallVector<int, 4> PSHUFDMask;
12607 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12608 return DAG.getBitcast(
12609 MVT::v4i64,
12610 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12611 DAG.getBitcast(MVT::v8i32, V1),
12612 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12613 }
12615 // AVX2 provides a direct instruction for permuting a single input across
12616 // lanes.
12617 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12618 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12619 }
12621 // Try to use shift instructions.
12622 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12623 Zeroable, Subtarget, DAG))
12624 return Shift;
12626 // If we have VLX support, we can use VALIGN or VEXPAND.
12627 if (Subtarget.hasVLX()) {
12628 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12629 Mask, Subtarget, DAG))
12630 return Rotate;
12632 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12633 V1, V2, DAG, Subtarget))
12634 return V;
12635 }
12637 // Try to use PALIGNR.
12638 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12639 Mask, Subtarget, DAG))
12640 return Rotate;
12642 // Use dedicated unpack instructions for masks that match their pattern.
12643 if (SDValue V =
12644 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12645 return V;
12647 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12648 // shuffle. However, if we have AVX2 and either inputs are already in place,
12649 // we will be able to shuffle even across lanes the other input in a single
12650 // instruction so skip this pattern.
12651 if (!isShuffleMaskInputInPlace(0, Mask) &&
12652 !isShuffleMaskInputInPlace(1, Mask))
12653 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12654 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12655 return Result;
12657 // Otherwise fall back on generic blend lowering.
12658 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12659 Mask, DAG);
12660 }
12662 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12663 ///
12664 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12665 /// isn't available.
12666 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12667 const APInt &Zeroable,
12668 SDValue V1, SDValue V2,
12669 const X86Subtarget &Subtarget,
12670 SelectionDAG &DAG) {
12671 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12672 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12673 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12675 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12676 Zeroable, Subtarget, DAG))
12677 return Blend;
12679 // Check for being able to broadcast a single element.
12680 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12681 Mask, Subtarget, DAG))
12682 return Broadcast;
12684 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12685 // options to efficiently lower the shuffle.
12686 SmallVector<int, 4> RepeatedMask;
12687 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12688 assert(RepeatedMask.size() == 4 &&
12689 "Repeated masks must be half the mask width!");
12691 // Use even/odd duplicate instructions for masks that match their pattern.
12692 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12693 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12694 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12695 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12697 if (V2.isUndef())
12698 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12699 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12701 // Use dedicated unpack instructions for masks that match their pattern.
12702 if (SDValue V =
12703 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12704 return V;
12706 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12707 // have already handled any direct blends.
12708 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12709 }
12711 // Try to create an in-lane repeating shuffle mask and then shuffle the
12712 // results into the target lanes.
12713 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12714 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12715 return V;
12717 // If we have a single input shuffle with different shuffle patterns in the
12718 // two 128-bit lanes, use a variable mask with VPERMILPS.
12719 if (V2.isUndef()) {
12720 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12721 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12722 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12724 if (Subtarget.hasAVX2())
12725 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12727 // Otherwise, fall back.
12728 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12729 DAG);
12730 }
12732 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12733 // shuffle.
12734 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12735 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12736 return Result;
12737 // If we have VLX support, we can use VEXPAND.
12738 if (Subtarget.hasVLX())
12739 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12740 V1, V2, DAG, Subtarget))
12741 return V;
12743 // For non-AVX512, if the mask consists of 16-bit in-lane elements, try to
12744 // split the shuffle, since after splitting we get more efficient code than
12745 // vblend by using the vpunpcklwd and vpunpckhwd instrs.
12746 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12747 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12748 Mask, DAG))
12749 return V;
12751 // If we have AVX2 then we always want to lower with a blend because at v8 we
12752 // can fully permute the elements.
12753 if (Subtarget.hasAVX2())
12754 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12755 Mask, DAG);
12757 // Otherwise fall back on generic lowering.
12758 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12759 }
12761 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12762 ///
12763 /// This routine is only called when we have AVX2 and thus a reasonable
12764 /// instruction set for v8i32 shuffling.
12765 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12766 const APInt &Zeroable,
12767 SDValue V1, SDValue V2,
12768 const X86Subtarget &Subtarget,
12769 SelectionDAG &DAG) {
12770 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12771 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12772 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12773 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12775 // Whenever we can lower this as a zext, that instruction is strictly faster
12776 // than any alternative. It also allows us to fold memory operands into the
12777 // shuffle in many cases.
12778 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12779 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12780 return ZExt;
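// Illustrative example (annotation, not in the original source): a v8i32
// mask of the form <0, Z, 1, Z, 2, Z, 3, Z>, where Z denotes a zeroable
// element, is equivalent to zero-extending the low four 32-bit elements to
// 64 bits.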
12782 // For non-AVX512, if the mask consists of 16-bit in-lane elements, try to
12783 // split the shuffle, since after splitting we get more efficient code than
12784 // vblend by using the vpunpcklwd and vpunpckhwd instrs.
12785 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12786 !Subtarget.hasAVX512())
12787 if (SDValue V =
12788 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12789 return V;
12791 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12792 Zeroable, Subtarget, DAG))
12793 return Blend;
12795 // Check for being able to broadcast a single element.
12796 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12797 Mask, Subtarget, DAG))
12798 return Broadcast;
12800 // If the shuffle mask is repeated in each 128-bit lane we can use more
12801 // efficient instructions that mirror the shuffles across the two 128-bit
12802 // lanes.
12803 SmallVector<int, 4> RepeatedMask;
12804 bool Is128BitLaneRepeatedShuffle =
12805 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12806 if (Is128BitLaneRepeatedShuffle) {
12807 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12808 if (V2.isUndef())
12809 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12810 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12812 // Use dedicated unpack instructions for masks that match their pattern.
12813 if (SDValue V =
12814 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12815 return V;
12816 }
12818 // Try to use shift instructions.
12819 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12820 Zeroable, Subtarget, DAG))
12821 return Shift;
12823 // If we have VLX support, we can use VALIGN or EXPAND.
12824 if (Subtarget.hasVLX()) {
12825 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12826 Mask, Subtarget, DAG))
12827 return Rotate;
12829 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
12830 V1, V2, DAG, Subtarget))
12831 return V;
12832 }
12834 // Try to use byte rotation instructions.
12835 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12836 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12837 return Rotate;
12839 // Try to create an in-lane repeating shuffle mask and then shuffle the
12840 // results into the target lanes.
12841 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12842 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12843 return V;
12845 // If the shuffle patterns aren't repeated but it is a single input, directly
12846 // generate a cross-lane VPERMD instruction.
12847 if (V2.isUndef()) {
12848 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12849 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12850 }
12852 // Assume that a single SHUFPS is faster than an alternative sequence of
12853 // multiple instructions (even if the CPU has a domain penalty).
12854 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12855 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12856 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12857 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12858 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12859 CastV1, CastV2, DAG);
12860 return DAG.getBitcast(MVT::v8i32, ShufPS);
12861 }
12863 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12864 // shuffle.
12865 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12866 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12867 return Result;
12869 // Otherwise fall back on generic blend lowering.
12870 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12871 Mask, DAG);
12872 }
12874 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12875 ///
12876 /// This routine is only called when we have AVX2 and thus a reasonable
12877 /// instruction set for v16i16 shuffling.
12878 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12879 const APInt &Zeroable,
12880 SDValue V1, SDValue V2,
12881 const X86Subtarget &Subtarget,
12882 SelectionDAG &DAG) {
12883 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12884 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12885 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12886 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12888 // Whenever we can lower this as a zext, that instruction is strictly faster
12889 // than any alternative. It also allows us to fold memory operands into the
12890 // shuffle in many cases.
12891 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12892 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12893 return ZExt;
12895 // Check for being able to broadcast a single element.
12896 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12897 Mask, Subtarget, DAG))
12898 return Broadcast;
12900 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12901 Zeroable, Subtarget, DAG))
12902 return Blend;
12904 // Use dedicated unpack instructions for masks that match their pattern.
12905 if (SDValue V =
12906 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12907 return V;
12909 // Try to use shift instructions.
12910 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12911 Zeroable, Subtarget, DAG))
12912 return Shift;
12914 // Try to use byte rotation instructions.
12915 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12916 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12917 return Rotate;
12919 // Try to create an in-lane repeating shuffle mask and then shuffle the
12920 // results into the target lanes.
12921 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12922 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12923 return V;
12925 if (V2.isUndef()) {
12926 // There are no generalized cross-lane shuffle operations available on i16
12927 // element types.
12928 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12929 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12930 Mask, DAG);
12932 SmallVector<int, 8> RepeatedMask;
12933 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12934 // As this is a single-input shuffle, the repeated mask should be
12935 // a strictly valid v8i16 mask that we can pass through to the v8i16
12936 // lowering to handle even the v16 case.
12937 return lowerV8I16GeneralSingleInputVectorShuffle(
12938 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
12939 }
12940 }
12942 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12943 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
12944 return PSHUFB;
12946 // AVX512BWVL can lower to VPERMW.
12947 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12948 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
12950 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12951 // shuffle.
12952 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12953 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12954 return Result;
12956 // Otherwise fall back on generic lowering.
12957 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
12958 }
12960 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
12961 ///
12962 /// This routine is only called when we have AVX2 and thus a reasonable
12963 /// instruction set for v32i8 shuffling.
12964 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12965 const APInt &Zeroable,
12966 SDValue V1, SDValue V2,
12967 const X86Subtarget &Subtarget,
12968 SelectionDAG &DAG) {
12969 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12970 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12971 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12972 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
12974 // Whenever we can lower this as a zext, that instruction is strictly faster
12975 // than any alternative. It also allows us to fold memory operands into the
12976 // shuffle in many cases.
12977 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12978 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12979 return ZExt;
12981 // Check for being able to broadcast a single element.
12982 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
12983 Mask, Subtarget, DAG))
12984 return Broadcast;
12986 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
12987 Zeroable, Subtarget, DAG))
12988 return Blend;
12990 // Use dedicated unpack instructions for masks that match their pattern.
12991 if (SDValue V =
12992 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
12993 return V;
12995 // Try to use shift instructions.
12996 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
12997 Zeroable, Subtarget, DAG))
12998 return Shift;
13000 // Try to use byte rotation instructions.
13001 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13002 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13003 return Rotate;
13005 // Try to create an in-lane repeating shuffle mask and then shuffle the
13006 // results into the target lanes.
13007 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13008 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13009 return V;
13011 // There are no generalized cross-lane shuffle operations available on i8
13012 // element types.
13013 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13014 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13015 DAG);
13017 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13018 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13019 return PSHUFB;
13021 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13022 // shuffle.
13023 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13024 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13025 return Result;
13027 // Otherwise fall back on generic lowering.
13028 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13029 }
13031 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13032 ///
13033 /// This routine either breaks down the specific type of a 256-bit x86 vector
13034 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13035 /// together based on the available instructions.
13036 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13037 MVT VT, SDValue V1, SDValue V2,
13038 const APInt &Zeroable,
13039 const X86Subtarget &Subtarget,
13040 SelectionDAG &DAG) {
13041 // If we have a single input to the zero element, insert that into V1 if we
13042 // can do so cheaply.
13043 int NumElts = VT.getVectorNumElements();
13044 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13046 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13047 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13048 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13049 return Insertion;
13051 // Handle special cases where the lower or upper half is UNDEF.
13052 if (SDValue V =
13053 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13054 return V;
13056 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13057 // can check for those subtargets here and avoid much of the subtarget
13058 // querying in the per-vector-type lowering routines. With AVX1 we have
13059 // essentially *zero* ability to manipulate a 256-bit vector with integer
13060 // types. Since we'll use floating point types there eventually, just
13061 // immediately cast everything to a float and operate entirely in that domain.
13062 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13063 int ElementBits = VT.getScalarSizeInBits();
13064 if (ElementBits < 32) {
13065 // No floating point type available; if we can't use the bit operations
13066 // for masking/blending, then decompose into 128-bit vectors.
13067 if (SDValue V =
13068 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13069 return V;
13070 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13071 return V;
13072 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13073 }
13075 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13076 VT.getVectorNumElements());
13077 V1 = DAG.getBitcast(FpVT, V1);
13078 V2 = DAG.getBitcast(FpVT, V2);
13079 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13080 }
13082 switch (VT.SimpleTy) {
13083 case MVT::v4f64:
13084 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13085 case MVT::v4i64:
13086 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13087 case MVT::v8f32:
13088 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13089 case MVT::v8i32:
13090 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13091 case MVT::v16i16:
13092 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13093 case MVT::v32i8:
13094 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13096 default:
13097 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13098 }
13099 }
13101 /// \brief Try to lower a vector shuffle as a series of 128-bit shuffles.
13102 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13103 ArrayRef<int> Mask, SDValue V1,
13104 SDValue V2, SelectionDAG &DAG) {
13105 assert(VT.getScalarSizeInBits() == 64 &&
13106 "Unexpected element type size for 128bit shuffle.");
13108 // Handling a 256-bit vector requires VLX, and for that case the function
13109 // lowerV2X128VectorShuffle() is most probably the better solution.
13110 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13112 SmallVector<int, 4> WidenedMask;
13113 if (!canWidenShuffleElements(Mask, WidenedMask))
13114 return SDValue();
13116 // Check for patterns which can be matched with a single insert of a 256-bit
13117 // subvector.
13118 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13119 {0, 1, 2, 3, 0, 1, 2, 3});
13120 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13121 {0, 1, 2, 3, 8, 9, 10, 11})) {
13122 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13123 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13124 DAG.getIntPtrConstant(0, DL));
13125 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13126 OnlyUsesV1 ? V1 : V2,
13127 DAG.getIntPtrConstant(0, DL));
13128 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13129 }
13131 assert(WidenedMask.size() == 4);
13133 // See if this is an insertion of the lower 128-bits of V2 into V1.
13134 bool IsInsert = true;
13135 int V2Index = -1;
13136 for (int i = 0; i < 4; ++i) {
13137 assert(WidenedMask[i] >= -1);
13138 if (WidenedMask[i] < 0)
13139 continue;
13141 // Make sure all V1 subvectors are in place.
13142 if (WidenedMask[i] < 4) {
13143 if (WidenedMask[i] != i) {
13144 IsInsert = false;
13145 break;
13146 }
13147 } else {
13148 // Make sure we only have a single V2 index and it's the lowest 128-bits.
13149 if (V2Index >= 0 || WidenedMask[i] != 4) {
13150 IsInsert = false;
13151 break;
13152 }
13153 V2Index = i;
13154 }
13155 }
13156 if (IsInsert && V2Index >= 0) {
13157 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13158 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13159 DAG.getIntPtrConstant(0, DL));
13160 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13161 }
13163 // Try to lower to vshuf64x2/vshuf32x4.
13164 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13165 unsigned PermMask = 0;
13166 // Ensure elements came from the same Op.
13167 for (int i = 0; i < 4; ++i) {
13168 assert(WidenedMask[i] >= -1);
13169 if (WidenedMask[i] < 0)
13170 continue;
13172 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13173 unsigned OpIndex = i / 2;
13174 if (Ops[OpIndex].isUndef())
13175 Ops[OpIndex] = Op;
13176 else if (Ops[OpIndex] != Op)
13177 return SDValue();
13179 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13180 // bits defined by a vshuf64x2 instruction's immediate control byte.
13181 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13182 }
13184 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13185 DAG.getConstant(PermMask, DL, MVT::i8));
13186 }
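// Worked example (annotation, not in the original source): for v8f64 with
// WidenedMask = <0, 1, 4, 5>, the loop picks Ops[0] = V1 and Ops[1] = V2
// and builds PermMask = 0 | (1 << 2) | (0 << 4) | (1 << 6) = 0x44, which
// selects the low 256 bits of each input.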
13188 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13189 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13190 const APInt &Zeroable,
13191 SDValue V1, SDValue V2,
13192 const X86Subtarget &Subtarget,
13193 SelectionDAG &DAG) {
13194 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13195 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13196 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13198 if (V2.isUndef()) {
13199 // Use low duplicate instructions for masks that match their pattern.
13200 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13201 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13203 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13204 // Non-half-crossing single input shuffles can be lowered with an
13205 // interleaved permutation.
13206 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13207 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13208 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13209 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13210 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13211 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13212 }
13214 SmallVector<int, 4> RepeatedMask;
13215 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13216 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13217 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13218 }
13220 if (SDValue Shuf128 =
13221 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13222 return Shuf128;
13224 if (SDValue Unpck =
13225 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13226 return Unpck;
13228 // Check if the blend happens to exactly fit that of SHUFPD.
13229 if (SDValue Op =
13230 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13231 return Op;
13233 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13234 V2, DAG, Subtarget))
13235 return V;
13237 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13238 Zeroable, Subtarget, DAG))
13239 return Blend;
13241 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13242 }
13244 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13245 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13246 const APInt &Zeroable,
13247 SDValue V1, SDValue V2,
13248 const X86Subtarget &Subtarget,
13249 SelectionDAG &DAG) {
13250 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13251 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13252 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13254 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13255 // options to efficiently lower the shuffle.
13256 SmallVector<int, 4> RepeatedMask;
13257 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13258 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13260 // Use even/odd duplicate instructions for masks that match their pattern.
13261 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13262 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13263 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13264 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13266 if (V2.isUndef())
13267 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13268 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13270 // Use dedicated unpack instructions for masks that match their pattern.
13271 if (SDValue Unpck =
13272 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13273 return Unpck;
13275 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13276 Zeroable, Subtarget, DAG))
13277 return Blend;
13279 // Otherwise, fall back to a SHUFPS sequence.
13280 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13281 }
13282 // If we have AVX512F support, we can use VEXPAND.
13283 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13284 V1, V2, DAG, Subtarget))
13285 return V;
13287 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13288 }
13290 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13291 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13292 const APInt &Zeroable,
13293 SDValue V1, SDValue V2,
13294 const X86Subtarget &Subtarget,
13295 SelectionDAG &DAG) {
13296 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13297 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13298 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13300 if (SDValue Shuf128 =
13301 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13302 return Shuf128;
13304 if (V2.isUndef()) {
13305 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13306 // can use lower latency instructions that will operate on all four
13307 // 128-bit lanes.
13308 SmallVector<int, 2> Repeated128Mask;
13309 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13310 SmallVector<int, 4> PSHUFDMask;
13311 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13312 return DAG.getBitcast(
13313 MVT::v8i64,
13314 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13315 DAG.getBitcast(MVT::v16i32, V1),
13316 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13317 }
13319 SmallVector<int, 4> Repeated256Mask;
13320 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13321 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13322 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13323 }
13325 // Try to use shift instructions.
13326 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13327 Zeroable, Subtarget, DAG))
13328 return Shift;
13330 // Try to use VALIGN.
13331 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13332 Mask, Subtarget, DAG))
13333 return Rotate;
13335 // Try to use PALIGNR.
13336 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13337 Mask, Subtarget, DAG))
13338 return Rotate;
13340 if (SDValue Unpck =
13341 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13342 return Unpck;
13343 // If we have AVX512F support, we can use VEXPAND.
13344 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13345 V2, DAG, Subtarget))
13346 return V;
13348 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13349 Zeroable, Subtarget, DAG))
13350 return Blend;
13352 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13353 }
13355 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13356 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13357 const APInt &Zeroable,
13358 SDValue V1, SDValue V2,
13359 const X86Subtarget &Subtarget,
13360 SelectionDAG &DAG) {
13361 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13362 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13363 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13365 // Whenever we can lower this as a zext, that instruction is strictly faster
13366 // than any alternative. It also allows us to fold memory operands into the
13367 // shuffle in many cases.
13368 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13369 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13370 return ZExt;
13372 // If the shuffle mask is repeated in each 128-bit lane we can use more
13373 // efficient instructions that mirror the shuffles across the four 128-bit
13374 // lanes.
13375 SmallVector<int, 4> RepeatedMask;
13376 bool Is128BitLaneRepeatedShuffle =
13377 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13378 if (Is128BitLaneRepeatedShuffle) {
13379 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13380 if (V2.isUndef())
13381 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13382 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13384 // Use dedicated unpack instructions for masks that match their pattern.
13385 if (SDValue V =
13386 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13387 return V;
13388 }
13390 // Try to use shift instructions.
13391 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13392 Zeroable, Subtarget, DAG))
13393 return Shift;
13395 // Try to use VALIGN.
13396 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13397 Mask, Subtarget, DAG))
13398 return Rotate;
13400 // Try to use byte rotation instructions.
13401 if (Subtarget.hasBWI())
13402 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13403 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13404 return Rotate;
13406 // Assume that a single SHUFPS is faster than using a permv shuffle.
13407 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13408 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13409 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13410 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13411 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13412 CastV1, CastV2, DAG);
13413 return DAG.getBitcast(MVT::v16i32, ShufPS);
13414 }
13415 // If we have AVX512F support, we can use VEXPAND.
13416 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13417 V1, V2, DAG, Subtarget))
13418 return V;
13420 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13421 Zeroable, Subtarget, DAG))
13422 return Blend;
13423 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13424 }
13426 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13427 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13428 const APInt &Zeroable,
13429 SDValue V1, SDValue V2,
13430 const X86Subtarget &Subtarget,
13431 SelectionDAG &DAG) {
13432 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13433 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13434 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13435 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13437 // Whenever we can lower this as a zext, that instruction is strictly faster
13438 // than any alternative. It also allows us to fold memory operands into the
13439 // shuffle in many cases.
13440 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13441 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13442 return ZExt;
13444 // Use dedicated unpack instructions for masks that match their pattern.
13445 if (SDValue V =
13446 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13447 return V;
13449 // Try to use shift instructions.
13450 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13451 Zeroable, Subtarget, DAG))
13452 return Shift;
13454 // Try to use byte rotation instructions.
13455 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13456 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13457 return Rotate;
13459 if (V2.isUndef()) {
13460 SmallVector<int, 8> RepeatedMask;
13461 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13462 // As this is a single-input shuffle, the repeated mask should be
13463 // a strictly valid v8i16 mask that we can pass through to the v8i16
13464 // lowering to handle even the v32 case.
13465 return lowerV8I16GeneralSingleInputVectorShuffle(
13466 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13467 }
13468 }
13470 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13471 Zeroable, Subtarget, DAG))
13472 return Blend;
13474 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13475 }
13477 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13478 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13479 const APInt &Zeroable,
13480 SDValue V1, SDValue V2,
13481 const X86Subtarget &Subtarget,
13482 SelectionDAG &DAG) {
13483 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13484 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13485 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13486 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13488 // Whenever we can lower this as a zext, that instruction is strictly faster
13489 // than any alternative. It also allows us to fold memory operands into the
13490 // shuffle in many cases.
13491 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13492 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13493 return ZExt;
13495 // Use dedicated unpack instructions for masks that match their pattern.
13496 if (SDValue V =
13497 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13498 return V;
13500 // Try to use shift instructions.
13501 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13502 Zeroable, Subtarget, DAG))
13503 return Shift;
13505 // Try to use byte rotation instructions.
13506 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13507 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13508 return Rotate;
13510 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13511 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13512 return PSHUFB;
13514 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13515 if (Subtarget.hasVBMI())
13516 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13518 // Try to create an in-lane repeating shuffle mask and then shuffle the
13519 // results into the target lanes.
13520 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13521 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13522 return V;
13524 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13525 Zeroable, Subtarget, DAG))
13526 return Blend;
13528 // FIXME: Implement direct support for this type!
13529 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13530 }
13532 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13533 ///
13534 /// This routine either breaks down the specific type of a 512-bit x86 vector
13535 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13536 /// together based on the available instructions.
13537 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13538 MVT VT, SDValue V1, SDValue V2,
13539 const APInt &Zeroable,
13540 const X86Subtarget &Subtarget,
13541 SelectionDAG &DAG) {
13542 assert(Subtarget.hasAVX512() &&
13543 "Cannot lower 512-bit vectors w/ basic ISA!");
13545 // If we have a single input to the zero element, insert that into V1 if we
13546 // can do so cheaply.
13547 int NumElts = Mask.size();
13548 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13550 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13551 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13552 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13553 return Insertion;
13555 // Check for being able to broadcast a single element.
13556 if (SDValue Broadcast =
13557 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13558 return Broadcast;
13560 // Dispatch to each element type for lowering. If we don't have support for
13561 // specific element type shuffles at 512 bits, immediately split them and
13562 // lower them. Each lowering routine of a given type is allowed to assume that
13563 // the requisite ISA extensions for that element type are available.
13564 switch (VT.SimpleTy) {
13565 case MVT::v8f64:
13566 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13567 case MVT::v16f32:
13568 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13569 case MVT::v8i64:
13570 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13571 case MVT::v16i32:
13572 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13573 case MVT::v32i16:
13574 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13575 case MVT::v64i8:
13576 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13578 default:
13579 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13580 }
13581 }
13583 // Lower vXi1 vector shuffles.
13584 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13585 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13586 // vector, shuffle and then truncate it back.
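// Illustrative example (annotation, not in the original source): a v16i1
// shuffle is sign-extended to v16i32, shuffled there, and the result is
// truncated (or converted via CVT2MASK) back to v16i1.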
13587 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13588 MVT VT, SDValue V1, SDValue V2,
13589 const X86Subtarget &Subtarget,
13590 SelectionDAG &DAG) {
13591 assert(Subtarget.hasAVX512() &&
13592 "Cannot lower 512-bit vectors w/o basic ISA!");
13593 MVT ExtVT;
13594 switch (VT.SimpleTy) {
13595 default:
13596 llvm_unreachable("Expected a vector of i1 elements");
13597 case MVT::v2i1:
13598 ExtVT = MVT::v2i64;
13599 break;
13600 case MVT::v4i1:
13601 ExtVT = MVT::v4i32;
13602 break;
13603 case MVT::v8i1:
13604 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13605 break;
13606 case MVT::v16i1:
13607 ExtVT = MVT::v16i32;
13608 break;
13609 case MVT::v32i1:
13610 ExtVT = MVT::v32i16;
13611 break;
13612 case MVT::v64i1:
13613 ExtVT = MVT::v64i8;
13614 break;
13615 }
13617 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13618 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13619 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13620 V1 = getOnesVector(ExtVT, DAG, DL);
13621 else
13622 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13624 if (V2.isUndef())
13625 V2 = DAG.getUNDEF(ExtVT);
13626 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13627 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13628 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13629 V2 = getOnesVector(ExtVT, DAG, DL);
13630 else
13631 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13633 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13634 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
13635 int NumElems = VT.getVectorNumElements();
13636 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13637 (Subtarget.hasDQI() && (NumElems < 32)))
13638 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13640 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13641 }
13643 /// Helper function that returns true if the shuffle mask should be
13644 /// commuted to improve canonicalization.
13645 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13646 int NumElements = Mask.size();
13648 int NumV1Elements = 0, NumV2Elements = 0;
13649 for (int M : Mask)
13650 if (M < 0)
13651 continue;
13652 else if (M < NumElements)
13653 ++NumV1Elements;
13654 else
13655 ++NumV2Elements;
13657 // Commute the shuffle as needed such that more elements come from V1 than
13658 // V2. This allows us to match the shuffle pattern strictly on how many
13659 // elements come from V1 without handling the symmetric cases.
13660 if (NumV2Elements > NumV1Elements)
13661 return true;
13663 assert(NumV1Elements > 0 && "No V1 indices");
13665 if (NumV2Elements == 0)
13666 return false;
13668 // When the number of V1 and V2 elements are the same, try to minimize the
13669 // number of uses of V2 in the low half of the vector. When that is tied,
13670 // ensure that the sum of indices for V1 is equal to or lower than the sum of
13671 // indices for V2. When those are equal, try to ensure that the number of odd
13672 // indices for V1 is lower than the number of odd indices for V2.
13673 if (NumV1Elements == NumV2Elements) {
13674 int LowV1Elements = 0, LowV2Elements = 0;
13675 for (int M : Mask.slice(0, NumElements / 2))
13676 if (M >= NumElements)
13677 ++LowV2Elements;
13678 else if (M >= 0)
13679 ++LowV1Elements;
13680 if (LowV2Elements > LowV1Elements)
13681 return true;
13682 if (LowV2Elements == LowV1Elements) {
13683 int SumV1Indices = 0, SumV2Indices = 0;
13684 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13685 if (Mask[i] >= NumElements)
13686 SumV2Indices += i;
13687 else if (Mask[i] >= 0)
13688 SumV1Indices += i;
13689 if (SumV2Indices < SumV1Indices)
13690 return true;
13691 if (SumV2Indices == SumV1Indices) {
13692 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13693 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13694 if (Mask[i] >= NumElements)
13695 NumV2OddIndices += i % 2;
13696 else if (Mask[i] >= 0)
13697 NumV1OddIndices += i % 2;
13698 if (NumV2OddIndices < NumV1OddIndices)
13699 return true;
13700 }
13701 }
13702 }
13704 return false;
13705 }
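// Illustrative example (annotation, not in the original source): for
// Mask = <4, 1, 6, 3> both inputs supply two elements and the low half is
// tied, but the V2 indices occupy positions summing to 2 versus 4 for V1,
// so the function returns true and the shuffle is commuted.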
13707 /// \brief Top-level lowering for x86 vector shuffles.
13708 ///
13709 /// This handles decomposition, canonicalization, and lowering of all x86
13710 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13711 /// above in helper routines. The canonicalization attempts to widen shuffles
13712 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13713 /// s.t. only one of the two inputs needs to be tested, etc.
13714 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13715 SelectionDAG &DAG) {
13716 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13717 ArrayRef<int> Mask = SVOp->getMask();
13718 SDValue V1 = Op.getOperand(0);
13719 SDValue V2 = Op.getOperand(1);
13720 MVT VT = Op.getSimpleValueType();
13721 int NumElements = VT.getVectorNumElements();
13722 SDLoc DL(Op);
13723 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13725 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13726 "Can't lower MMX shuffles");
13728 bool V1IsUndef = V1.isUndef();
13729 bool V2IsUndef = V2.isUndef();
13730 if (V1IsUndef && V2IsUndef)
13731 return DAG.getUNDEF(VT);
13733 // When we create a shuffle node we put the UNDEF node as the second operand,
13734 // but in some cases the first operand may be transformed to UNDEF.
13735 // In this case we should just commute the node.
13736 if (V1IsUndef)
13737 return DAG.getCommutedVectorShuffle(*SVOp);
13739 // Check for non-undef masks pointing at an undef vector and make the masks
13740 // undef as well. This makes it easier to match the shuffle based solely on
13741 // the mask.
13742 if (V2IsUndef)
13743 for (int M : Mask)
13744 if (M >= NumElements) {
13745 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13746 for (int &M : NewMask)
13747 if (M >= NumElements)
13748 M = -1;
13749 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13750 }
13752 // Check for illegal shuffle mask element index values.
13753 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13754 assert(llvm::all_of(Mask,
13755 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13756 "Out of bounds shuffle index");
13758 // We actually see shuffles that are entirely re-arrangements of a set of
13759 // zero inputs. This mostly happens while decomposing complex shuffles into
13760 // simple ones. Directly lower these as a buildvector of zeros.
13761 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13762 if (Zeroable.isAllOnesValue())
13763 return getZeroVector(VT, Subtarget, DAG, DL);
13765 // Try to collapse shuffles into using a vector type with fewer elements but
13766 // wider element types. We cap this to not form integers or floating point
13767 // elements wider than 64 bits, but it might be interesting to form i128
13768 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
13769 SmallVector<int, 16> WidenedMask;
13770 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13771 canWidenShuffleElements(Mask, WidenedMask)) {
13772 MVT NewEltVT = VT.isFloatingPoint()
13773 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13774 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13775 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13776 // Make sure that the new vector type is legal. For example, v2f64 isn't
13777 // legal on SSE1.
13778 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13779 V1 = DAG.getBitcast(NewVT, V1);
13780 V2 = DAG.getBitcast(NewVT, V2);
13781 return DAG.getBitcast(
13782 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13783 }
13784 }
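// Illustrative example (annotation, not in the original source): the v8i16
// mask <0, 1, 4, 5, 2, 3, 6, 7> pairs up cleanly and widens to the v4i32
// mask <0, 2, 1, 3>.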
13786 // Commute the shuffle if it will improve canonicalization.
13787 if (canonicalizeShuffleMaskWithCommute(Mask))
13788 return DAG.getCommutedVectorShuffle(*SVOp);
13790 // For each vector width, delegate to a specialized lowering routine.
13791 if (VT.is128BitVector())
13792 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13793 DAG);
13795 if (VT.is256BitVector())
13796 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13797 DAG);
13799 if (VT.is512BitVector())
13800 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13801 DAG);
13803 if (Is1BitVector)
13804 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13806 llvm_unreachable("Unimplemented!");
13809 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13810 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13811 const X86Subtarget &Subtarget,
13812 SelectionDAG &DAG) {
13813 SDValue Cond = Op.getOperand(0);
13814 SDValue LHS = Op.getOperand(1);
13815 SDValue RHS = Op.getOperand(2);
13816 SDLoc dl(Op);
13817 MVT VT = Op.getSimpleValueType();
13819 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13820 return SDValue();
13821 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13823 // Only non-legal VSELECTs reach this lowering; convert those into generic
13824 // shuffles and re-use the shuffle lowering path for blends.
13825 SmallVector<int, 32> Mask;
13826 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13827 SDValue CondElt = CondBV->getOperand(i);
13829 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13832 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  // A vselect where all conditions and data are constants can be optimized into
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
    return Op;

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // instructions.
  if (VT.getSizeInBits() == 512) {
    SDValue Cond = Op.getOperand(0);
    // The vNi1 condition case should be handled above as it can be trivially
    // lowered.
    assert(Cond.getValueType().getScalarSizeInBits() ==
               VT.getScalarSizeInBits() &&
           "Should have a size-matched integer condition!");
    // Build a mask by testing the condition against itself (tests for zero).
    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
    // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op; if we need to expand instead, return a
  // null value.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16:
    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return Op;

    // FIXME: We should custom lower this by fixing the condition and using i8
    // blends.
    return SDValue();
  }
}

static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();

    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         isNullConstant(Op.getOperand(1))) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();

    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getBitcast(MVT::f32, Extract);
  }

  if (VT == MVT::i32 || VT == MVT::i64) {
    // ExtractPS/pextrq works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }

  return SDValue();
}

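// The constant-index path below isolates bit IdxVal by shifting it to the
// top of the mask register and back down to bit 0. For example, extracting
// bit 5 of a v16i1 mask emits kshiftlw $10 followed by kshiftrw $15.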
/// Extract one bit from mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
SDValue
X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Vec);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  MVT EltVT = Op.getSimpleValueType();

  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
         "Unexpected vector type in ExtractBitFromMaskVector");

  // A variable index can't be handled in mask registers; extend the vector
  // to VR512/VR128 instead.
  if (!isa<ConstantSDNode>(Idx)) {
    unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
    // than extending to 128/256-bit.
    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                              ExtVT.getVectorElementType(), Ext, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
      (VecVT.getVectorNumElements() < 8)) {
    // Use kshiftlw/rw instruction.
    VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
                      DAG.getUNDEF(VecVT),
                      Vec,
                      DAG.getIntPtrConstant(0, dl));
  }
  unsigned MaxShift = VecVT.getVectorNumElements() - 1;
  if (IdxVal != MaxShift)
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                    DAG.getConstant(MaxShift, dl, MVT::i8));
  return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
}

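// For wide vectors the lowering below first narrows to a 128-bit chunk.
// For example, extracting element 5 of a v8i32 extracts the upper 128-bit
// half and then extracts element 5 & 3 == 1 from the resulting v4i32.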
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);

  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG);

  if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get the performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // example : extractelement <16 x i8> %a, i32 %i
    //
    // Block Throughput: 3.00 Cycles
    // Throughput Bottleneck: Port5
    //
    // | Num Of |   Ports pressure in cycles  |    |
    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
    // ---------------------------------------------
    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
    // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
    // Total Num Of Uops: 4
    //
    //
    // Block Throughput: 1.00 Cycles
    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
    //
    // |    |  Ports pressure in cycles   |  |
    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
    // ---------------------------------------------------------
    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4

    return SDValue();
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
    IdxVal &= ElemsPerChunk - 1;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  MVT VT = Op.getSimpleValueType();

  if (VT.getSizeInBits() == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    // Transform it so it matches pextrw, which produces a 32-bit result.
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling to
  // stack.
  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
    // Extract either the lowest i32 or any i16, and extract the sub-byte.
    int DWordIdx = IdxVal / 4;
    if (DWordIdx == 0) {
      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                DAG.getBitcast(MVT::v4i32, Vec),
                                DAG.getIntPtrConstant(DWordIdx, dl));
      int ShiftVal = (IdxVal % 4) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }

    int WordIdx = IdxVal / 2;
    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                              DAG.getBitcast(MVT::v8i16, Vec),
                              DAG.getIntPtrConstant(WordIdx, dl));
    int ShiftVal = (IdxVal % 2) * 8;
    if (ShiftVal != 0)
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
  }

  if (VT.getSizeInBits() == 32) {
    if (IdxVal == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}

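// Inserting a bit into a mask register below follows the same kshift idiom
// as the extraction above: isolate the new bit, clear the target position in
// the source mask with a pair of shifts, and OR the two together.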
/// Insert one bit to mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
SDValue
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  MVT VecVT = Vec.getSimpleValueType();

  if (!isa<ConstantSDNode>(Idx)) {
    // Non-constant index. Extend source and destination,
    // insert element and then truncate the result.
    MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
    MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
  unsigned NumElems = VecVT.getVectorNumElements();

  if (Vec.isUndef()) {
    if (IdxVal)
      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    return EltInVec;
  }

  // Insertion of one bit into first position
  if (IdxVal == 0) {
    // Clean top bits of vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    // Clean the first bit in source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
  // Insertion of one bit into last position
  if (IdxVal == NumElems - 1) {
    // Move the bit to the last position inside the vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(IdxVal, dl, MVT::i8));
    // Clean the last bit in the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }

  // Use shuffle to insert element.
  SmallVector<int, 64> MaskVec(NumElems);
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = (i == IdxVal) ? NumElems : i;

  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
}

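// For example, inserting zero into element 2 of a v4i32 below becomes a
// shuffle of the source with a zero vector using the blend mask <0,1,6,3>,
// where index 6 selects element 2 of the zero vector.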
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();

  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);
  if (!isa<ConstantSDNode>(N2))
    return SDValue();
  auto *N2C = cast<ConstantSDNode>(N2);
  unsigned IdxVal = N2C->getZExtValue();

  bool IsZeroElt = X86::isZeroNode(N1);
  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

  // If we are inserting an element, see if we can do this more efficiently
  // with a blend shuffle with a rematerializable vector than a costly integer
  // insertion.
  // TODO: pre-SSE41 targets will tend to use bit masking - this could still
  // be beneficial if we are inserting several zeros and can combine the masks.
  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                  : DAG.getConstant(-1, dl, VT);
    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
      }
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getConstant(IdxIn128, dl, MVT::i32));

    // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
  // argument. SSE41 required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }

    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      //   zero here. The DAG Combiner may combine an extract_elt index into
      //   these bits. For example (insert (extract, 3), 2) could be matched by
      //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      //   value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      //   combine either bitwise AND or insert of float 0.0 to set these bits.

      bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand form.
        N2 = DAG.getIntPtrConstant(1, dl);
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      }
      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar-to-vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }

    // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }
  return SDValue();
}

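// For sub-i32 scalars the lowering below any-extends to i32 so that a plain
// movd (a v4i32 scalar_to_vector) can be used, then bitcasts to the requested
// type; e.g. (v8i16 scalar_to_vector x) becomes
// (bitcast (v4i32 scalar_to_vector (any_extend x))).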
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
  // further combines.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && "Expected an SSE type!");

  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");

  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT ResVT = Op.getSimpleValueType();

  assert((In.getSimpleValueType().is256BitVector() ||
          In.getSimpleValueType().is512BitVector()) &&
         "Can only extract from 256-bit or 512-bit vectors");

  // If the input is a buildvector just emit a smaller one.
  unsigned ElemsPerChunk = ResVT.getVectorNumElements();
  if (In.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));

  // Everything else is legal.
  return Op;
}

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}

// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
  // References to absolute symbols are never PC-relative.
  if (GV && GV->isAbsoluteSymbolRef())
    return X86ISD::Wrapper;

  CodeModel::Model M = getTargetMachine().getCodeModel();
  if (Subtarget.isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    return X86ISD::WrapperRIP;

  return X86ISD::Wrapper;
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetConstantPool(
      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}

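// Concretely, in 32-bit PIC mode a wrapped address such as the jump table
// below is computed as
//   (add (X86ISD::GlobalBaseReg), (X86ISD::Wrapper TargetJumpTable:t)),
// while RIP-relative targets use X86ISD::WrapperRIP, which selection folds
// into a RIP-relative lea.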
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

  SDLoc DL(Op);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isPositionIndependent() && !Subtarget.is64Bit()) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
  unsigned char OpFlags =
    Subtarget.classifyBlockAddressReference();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
                                              const SDLoc &dl, int64_t Offset,
                                              SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
  CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
  }

  Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);

  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

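// The TLSADDR/TLSBASEADDR pseudos emitted above expand to (roughly) the
// canonical __tls_get_addr call sequence, e.g. for x86-64 general dynamic:
//   leaq  x@tlsgd(%rip), %rdi
//   callq __tls_get_addr@PLT     ; result returned in %rax
// which is why the return register passed to GetTLSADDR is RAX (EAX for the
// 32-bit variants, which additionally need the GOT base in EBX).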
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
      .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or   "addl x@indntpoff,%eax" (initial exec)
  // or   "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().Options.EmulatedTLS)
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
      case TLSModel::GeneralDynamic:
        if (Subtarget.is64Bit())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
      case TLSModel::LocalDynamic:
        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
                                           Subtarget.is64Bit());
      case TLSModel::InitialExec:
      case TLSModel::LocalExec:
        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                   PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                               DAG.getIntPtrConstant(0, DL, true),
                               Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium() ||
      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
    //   mov     rcx, qword [rdx+rcx*8]
    //   mov     eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    //   Windows 64bit: gs:0x58
    //   Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of start of .tls section
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

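// For example, with i32 parts and a shift amount of 40, shld/shrd only see
// 40 & 31 == 8, so the lowering below tests bit 5 of the amount (the AND
// with VTBits) and uses CMOVs to substitute the large-shift results; for
// SHL_PARTS that yields Hi = Lo << 8 and Lo = 0.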
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getSimpleValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
  // during isel.
  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, dl, MVT::i8))
                       : DAG.getConstant(0, dl, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  }

  // If the shift amount is larger than or equal to the width of a part we
  // can't rely on the results of shld/shrd. Insert a test and select the
  // appropriate values for large shift amounts.
  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, dl, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, dl, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                     DAG.getUNDEF(SrcVT)));
    }
    if (SrcVT.getVectorElementType() == MVT::i1) {
      if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
        return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                           DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
    }
    return SDValue();
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget.is64Bit()) {
    return Op;
  }

  SDValue ValueToStore = Op.getOperand(0);
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);

  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue Chain = DAG.getStore(
      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD
  SDLoc DL(Op);
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                           X86ISD::FILD, DL,
                                           Tys, Ops, SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueSizeInBits()/8;
    int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
    auto PtrVT = getPointerTy(MF.getDataLayout());
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, Op.getValueType(), MMO);
    Result = DAG.getLoad(
        Op.getValueType(), DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  }

  return Result;
}

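// The magic constants in the routine below work as follows: unpacking the
// i64 against {0x43300000,0x45300000} builds the doubles
//   D0 = 2^52 + (double)lo32   (bit pattern 0x43300000'lo32)
//   D1 = 2^84 + hi32 * 2^32    (bit pattern 0x45300000'hi32)
// and subtracting {2^52, 2^84} leaves {lo32, hi32 * 2^32} exactly; the final
// horizontal add produces hi32 * 2^32 + lo32 with a single rounding step.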
/// 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */

  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

  SmallVector<Constant*,2> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 =
      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue Unpck1 =
      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

  SDValue CLod1 =
      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;

  if (Subtarget.hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0, dl));
}

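// The bias trick below relies on every u32 being exactly representable in
// f64: OR'ing the value into the low word of 0x4330000000000000 yields the
// double 2^52 + x with no rounding, so subtracting the 2^52 bias recovers x
// exactly before the final conversion to the destination type.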
/// 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getBitcast(MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0, dl));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(
      ISD::OR, dl, MVT::v2i64,
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  Or =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

  // Subtract the bias.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  MVT DestVT = Op.getSimpleValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0, dl));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  // The destination is f64; no rounding is needed.
  return Sub;
}

static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  SDValue N0 = Op.getOperand(0);
  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  // Legalize to v4i32 type.
  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                   DAG.getUNDEF(MVT::v2i32));

  if (Subtarget.hasAVX512())
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);

  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

  // Two to the power of half-word-size.
  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

  // Clear upper part of LO, lower HI.
  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);

  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
          fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);

  // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}

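// Numerically, the algorithm below works because both halves are exact:
//   (float)(0x4b000000 | (v & 0xffff)) == 2^23 + (v & 0xffff)
//   (float)(0x53000000 | (v >> 16))    == 2^39 + (v >> 16) * 2^16
// so adding -(2^39 + 2^23) (bit pattern 0xD3000080) to the high half and
// then summing the halves reconstructs v with a single rounding at the
// final FADD.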
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;
  //
  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
  MVT VecIntVT = V.getSimpleValueType();
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something other than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                                 (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
  //     return (float4) lo + fhi;
  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}

SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  MVT SrcVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.getVectorElementType() == MVT::i1) {
    if (SrcVT == MVT::v2i1)
      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
    MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
  }

  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v8i8:
  case MVT::v8i16: {
    MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
  }
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  case MVT::v16i8:
  case MVT::v16i16:
    assert(Subtarget.hasAVX512());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
  }
}

15320 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15321 SelectionDAG &DAG) const {
15322 SDValue N0 = Op.getOperand(0);
15324 auto PtrVT = getPointerTy(DAG.getDataLayout());
15326 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
15327 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15328 // the optimization here.
15329 if (DAG.SignBitIsZero(N0))
15330 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15332 if (Op.getSimpleValueType().isVector())
15333 return lowerUINT_TO_FP_vec(Op, DAG);
15335 MVT SrcVT = N0.getSimpleValueType();
15336 MVT DstVT = Op.getSimpleValueType();
15338 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15339 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15340 // Conversions from unsigned i32 to f32/f64 are legal,
15341 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
15345 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15346 return LowerUINT_TO_FP_i64(Op, DAG);
15347 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15348 return LowerUINT_TO_FP_i32(Op, DAG);
15349 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15352 // Make a 64-bit buffer, and use it to build an FILD.
15353 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15354 if (SrcVT == MVT::i32) {
15355 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15356 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15357 StackSlot, MachinePointerInfo());
15358 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15359 OffsetSlot, MachinePointerInfo());
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }
15364 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15365 SDValue ValueToStore = Op.getOperand(0);
15366 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15367 // Bitcasting to f64 here allows us to do a single 64-bit store from
15368 // an SSE register, avoiding the store forwarding penalty that would come
15369 // with two 32-bit stores.
15370 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15371 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15372 MachinePointerInfo());
15373 // For i64 source, we need to add the appropriate power of 2 if the input
15374 // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15376 // we must be careful to do the computation in x87 extended precision, not
15377 // in SSE. (The generic code can't know it's OK to do this, or how to.)
15378 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15379 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15380 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15381 MachineMemOperand::MOLoad, 8, 8);
15383 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15384 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         MVT::i64, MMO);
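  // Note: 0x5F800000 below is 2^64 as an IEEE-754 single (exponent 191,
  // mantissa 0). FILD read the u64 as signed, so a negative input currently
  // stands for (input - 2^64); adding 2^64 back yields the unsigned value.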
15388 APInt FF(32, 0x5F800000ULL);
15390 // Check whether the sign bit is set.
15391 SDValue SignSet = DAG.getSetCC(
15392 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15393 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15395 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15396 SDValue FudgePtr = DAG.getConstantPool(
15397 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15399 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15400 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15401 SDValue Four = DAG.getIntPtrConstant(4, dl);
15402 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15403 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15405 // Load the value out, extending it from f32 to f80.
15406 // FIXME: Avoid the extend by constructing the right constant pool?
15407 SDValue Fudge = DAG.getExtLoad(
15408 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15409 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15410 /* Alignment = */ 4);
15411 // Extend everything to 80 bits to force it to be done on x87.
15412 // TODO: Are there any fast-math-flags to propagate here?
15413 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15414 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                     DAG.getIntPtrConstant(0, dl));
}
15418 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15419 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15420 // just return an <SDValue(), SDValue()> pair.
15421 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15422 // to i16, i32 or i64, and we lower it to a legal sequence.
15423 // If lowered to the final integer result we return a <result, SDValue()> pair.
15424 // Otherwise we lower it to a sequence ending with a FIST, return a
15425 // <FIST, StackSlot> pair, and the caller is responsible for loading
15426 // the final integer result from StackSlot.
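// For example, an f80 -> i64 FP_TO_SINT comes back as a <FIST, StackSlot>
// pair the caller must load from, whereas an SSE f64 -> i32 conversion is
// really Legal and comes back as <SDValue(), SDValue()>.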
15427 std::pair<SDValue,SDValue>
15428 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, bool IsReplace) const {
  SDLoc DL(Op);
15432 EVT DstTy = Op.getValueType();
15433 EVT TheVT = Op.getOperand(0).getValueType();
15434 auto PtrVT = getPointerTy(DAG.getDataLayout());
15436 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15437 // f16 must be promoted before using the lowering in this routine.
15438 // fp128 does not use this lowering.
    return std::make_pair(SDValue(), SDValue());
  }
15442 // If using FIST to compute an unsigned i64, we'll need some fixup
15443 // to handle values above the maximum signed i64. A FIST is always
15444 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15445 bool UnsignedFixup = !IsSigned &&
15446 DstTy == MVT::i64 &&
15447 (!Subtarget.is64Bit() ||
15448 !isScalarFPTypeInSSEReg(TheVT));
15450 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15451 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
  // The low 32 bits of the FIST result will hold the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }
15457 assert(DstTy.getSimpleVT() <= MVT::i64 &&
15458 DstTy.getSimpleVT() >= MVT::i16 &&
15459 "Unknown FP_TO_INT to lower!");
15461 // These are really Legal.
15462 if (DstTy == MVT::i32 &&
15463 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15464 return std::make_pair(SDValue(), SDValue());
15465 if (Subtarget.is64Bit() &&
15466 DstTy == MVT::i64 &&
15467 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15468 return std::make_pair(SDValue(), SDValue());
  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
15472 MachineFunction &MF = DAG.getMachineFunction();
15473 unsigned MemSize = DstTy.getSizeInBits()/8;
15474 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  unsigned Opc;
15478 switch (DstTy.getSimpleVT().SimpleTy) {
15479 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15480 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15481 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }
15485 SDValue Chain = DAG.getEntryNode();
15486 SDValue Value = Op.getOperand(0);
15487 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15489 if (UnsignedFixup) {
15491 // Conversion to unsigned i64 is implemented with a select,
15492 // depending on whether the source value fits in the range
15493 // of a signed i64. Let Thresh be the FP equivalent of
15494 // 0x8000000000000000ULL.
15496 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15497 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15498 // Fist-to-mem64 FistSrc
15499 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15500 // to XOR'ing the high 32 bits with Adjust.
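  // Worked example: for Value == 2^63, (Value < Thresh) is false, so
  // Adjust == 0x80000000 and FistSrc == Value - Thresh == 0.0. The FIST
  // stores 0, and XORing the high word with Adjust restores
  // 0x8000000000000000 (2^63).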
15502 // Being a power of 2, Thresh is exactly representable in all FP formats.
15503 // For X87 we'd like to use the smallest FP type for this constant, but
15504 // for DAG type consistency we have to match the FP operand type.
15506 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15507 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15508 bool LosesInfo = false;
15509 if (TheVT == MVT::f64)
15510 // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
15513 else if (TheVT == MVT::f80)
15514 Status = Thresh.convert(APFloat::x87DoubleExtended(),
15515 APFloat::rmNearestTiesToEven, &LosesInfo);
15517 assert(Status == APFloat::opOK && !LosesInfo &&
15518 "FP conversion should have been exact");
15520 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15522 SDValue Cmp = DAG.getSetCC(DL,
15523 getSetCCResultType(DAG.getDataLayout(),
15524 *DAG.getContext(), TheVT),
15525 Value, ThreshVal, ISD::SETLT);
15526 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15527 DAG.getConstant(0, DL, MVT::i32),
15528 DAG.getConstant(0x80000000, DL, MVT::i32));
15529 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15530 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15531 *DAG.getContext(), TheVT),
15532 Value, ThreshVal, ISD::SETLT);
    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
  }
  // FIXME: This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the call stack.
15538 if (isScalarFPTypeInSSEReg(TheVT)) {
15539 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15540 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15541 MachinePointerInfo::getFixedStack(MF, SSFI));
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };
15547 MachineMemOperand *MMO =
15548 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15549 MachineMemOperand::MOLoad, MemSize, MemSize);
15550 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15551 Chain = Value.getValue(1);
15552 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  }
15556 MachineMemOperand *MMO =
15557 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15558 MachineMemOperand::MOStore, MemSize, MemSize);
15560 if (UnsignedFixup) {
15562 // Insert the FIST, load its result as two i32's,
15563 // and XOR the high i32 with Adjust.
15565 SDValue FistOps[] = { Chain, Value, StackSlot };
15566 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15567 FistOps, DstTy, MMO);
    SDValue Low32 =
        DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15571 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
    SDValue High32 =
        DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15575 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15577 if (Subtarget.is64Bit()) {
15578 // Join High32 and Low32 into a 64-bit result.
15579 // (High32 << 32) | Low32
15580 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15581 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15582 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15583 DAG.getConstant(32, DL, MVT::i8));
15584 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
      return std::make_pair(Result, SDValue());
    }
15588 SDValue ResultOps[] = { Low32, High32 };
15590 SDValue pair = IsReplace
15591 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15592 : DAG.getMergeValues(ResultOps, DL);
    return std::make_pair(pair, SDValue());
  }
15595 // Build the FP_TO_INT*_IN_MEM
15596 SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                         Ops, DstTy, MMO);
  return std::make_pair(FIST, StackSlot);
}
15603 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15604 const X86Subtarget &Subtarget) {
15605 MVT VT = Op->getSimpleValueType(0);
15606 SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);
15610 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15611 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //
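  // Illustrative zero-extend of v8i16 %x without AVX2:
  //   OpLo = vpunpcklwd %x, %zero   ; low 4 elements as v4i32
  //   OpHi = vpunpckhwd %x, %zero   ; high 4 elements as v4i32
  //   result = concat(OpLo, OpHi)   ; v8i32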
15626 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15627 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
    return SDValue();
15631 if (Subtarget.hasInt256())
15632 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15634 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15635 SDValue Undef = DAG.getUNDEF(InVT);
15636 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15637 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15638 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15640 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15641 VT.getVectorNumElements()/2);
15643 OpLo = DAG.getBitcast(HVT, OpLo);
15644 OpHi = DAG.getBitcast(HVT, OpHi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
15649 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15650 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15651 MVT VT = Op->getSimpleValueType(0);
15652 SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc DL(Op);
15655 unsigned NumElts = VT.getVectorNumElements();
15657 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15658 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15659 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
  if (InVT.getVectorElementType() != MVT::i1)
    return SDValue();
  // Extend VT if the target is a 256- or 128-bit vector and VLX is not
  // supported.
  MVT ExtVT = VT;
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);

  SDValue One =
      DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
  SDValue Zero =
      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL,
                      ExtVT);

  SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
  if (VT == ExtVT)
    return SelectedVal;
  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
}
15680 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15681 SelectionDAG &DAG) {
15682 if (Subtarget.hasFp256())
  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;

  return SDValue();
}
15689 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15690 SelectionDAG &DAG) {
15692 MVT VT = Op.getSimpleValueType();
15693 SDValue In = Op.getOperand(0);
15694 MVT SVT = In.getSimpleValueType();
15696 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15697 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15699 if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;
15703 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
         VT.getVectorNumElements() != SVT.getVectorNumElements());
  return SDValue();
}
15708 /// Helper to recursively truncate vector elements in half with PACKSS.
15709 /// It makes use of the fact that vector comparison results will be all-zeros
15710 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15711 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15712 /// within each 128-bit lane.
static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
                                               const SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2. Bail out on AVX512 targets, which have fast native
  // truncate instructions of their own.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();
15721 EVT SrcVT = In.getValueType();
15723 // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;
15727 // We only support vector truncation to 128bits or greater from a
15728 // 256bits or greater source.
  if ((DstVT.getSizeInBits() % 128) != 0)
    return SDValue();
  if ((SrcVT.getSizeInBits() % 256) != 0)
    return SDValue();
15734 unsigned NumElems = SrcVT.getVectorNumElements();
15735 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15736 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
  EVT PackedSVT =
      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15741 // Extract lower/upper subvectors.
15742 unsigned NumSubElts = NumElems / 2;
15743 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15744 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15745 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15747 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15748 if (SrcVT.is256BitVector()) {
15749 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15750 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15751 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }
15755 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15756 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15757 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15758 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15759 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15760 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15762 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15763 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15764 Res = DAG.getBitcast(MVT::v4i64, Res);
15765 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15767 if (DstVT.is256BitVector())
15768 return DAG.getBitcast(DstVT, Res);
15770 // If 512bit -> 128bit truncate another stage.
15771 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15772 Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
  }
15776 // Recursively pack lower/upper subvectors, concat result and pack again.
15777 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15778 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15779 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15780 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15782 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15783 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
}
15787 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
15791 MVT VT = Op.getSimpleValueType();
15792 SDValue In = Op.getOperand(0);
15793 MVT InVT = In.getSimpleValueType();
15795 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15797 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
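  // e.g. v16i8 -> v16i1 under BWI: shift each lane left so its LSB becomes
  // the sign bit, then VPMOVB2M gathers the sign bits into a mask register.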
15798 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15799 if (InVT.getScalarSizeInBits() <= 16) {
15800 if (Subtarget.hasBWI()) {
      // Legal; this will be selected to VPMOVB2M or VPMOVW2M.
      // Shifting packed bytes is not supported natively, so bitcast to words.
15803 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15804 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15805 DAG.getBitcast(ExtVT, In),
15806 DAG.getConstant(ShiftInx, DL, ExtVT));
15807 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
    }
15810 // Use TESTD/Q, extended vector to packed dword/qword.
15811 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15812 "Unexpected vector type.");
15813 unsigned NumElts = InVT.getVectorNumElements();
15814 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }
15820 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15821 DAG.getConstant(ShiftInx, DL, InVT));
  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
15827 MVT VT = Op.getSimpleValueType();
15828 SDValue In = Op.getOperand(0);
15829 MVT InVT = In.getSimpleValueType();
15831 if (VT == MVT::i1) {
15832 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15833 "Invalid scalar TRUNCATE operation");
    if (InVT.getSizeInBits() >= 32)
      return SDValue();
15836 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
  }
15839 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15840 "Invalid TRUNCATE operation");
15842 if (VT.getVectorElementType() == MVT::i1)
15843 return LowerTruncateVecI1(Op, DAG, Subtarget);
15845 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15846 if (Subtarget.hasAVX512()) {
    // Word-to-byte truncation is only legal with BWI; otherwise go via v16i32.
15848 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15849 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15850 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
  }
15854 // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
15855 if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
    if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
      return V;
15859 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15860 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
15861 if (Subtarget.hasInt256()) {
15862 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15863 In = DAG.getBitcast(MVT::v8i32, In);
15864 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
15865 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }
15869 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15870 DAG.getIntPtrConstant(0, DL));
15871 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15872 DAG.getIntPtrConstant(2, DL));
15873 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15874 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15875 static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }
15879 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15880 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
15881 if (Subtarget.hasInt256()) {
15882 In = DAG.getBitcast(MVT::v32i8, In);
15884 // The PSHUFB mask:
15885 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
15886 -1, -1, -1, -1, -1, -1, -1, -1,
15887 16, 17, 20, 21, 24, 25, 28, 29,
15888 -1, -1, -1, -1, -1, -1, -1, -1 };
15889 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
15890 In = DAG.getBitcast(MVT::v4i64, In);
15892 static const int ShufMask2[] = {0, 2, -1, -1};
15893 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
15894 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15895 DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(VT, In);
    }
15899 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15900 DAG.getIntPtrConstant(0, DL));
15902 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15903 DAG.getIntPtrConstant(4, DL));
15905 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15906 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15908 // The PSHUFB mask:
15909 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15910 -1, -1, -1, -1, -1, -1, -1, -1};
15912 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
15913 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
15915 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15916 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15918 // The MOVLHPS Mask:
15919 static const int ShufMask2[] = {0, 1, 4, 5};
15920 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, res);
  }
15924 // Handle truncation of V256 to V128 using shuffles.
  if (!VT.is128BitVector() || !InVT.is256BitVector())
    return SDValue();
15928 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15930 unsigned NumElems = VT.getVectorNumElements();
15931 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15933 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15934 // Prepare truncation shuffle mask
15935 for (unsigned i = 0; i != NumElems; ++i)
15936 MaskVec[i] = i * 2;
15937 In = DAG.getBitcast(NVT, In);
15938 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
15939 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0, DL));
}
15943 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
15944 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15945 MVT VT = Op.getSimpleValueType();
15947 if (VT.isVector()) {
15948 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = Op.getOperand(0);
    SDLoc dl(Op);
15951 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15952 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
15953 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                      DAG.getUNDEF(MVT::v2f32)));
    }

    return SDValue();
  }
15960 assert(!VT.isVector());
15962 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15963 IsSigned, /*IsReplace=*/ false);
15964 SDValue FIST = Vals.first, StackSlot = Vals.second;
15965 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FIST.getNode())
    return Op;
15969 if (StackSlot.getNode())
15970 // Load the result.
15971 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
  // The node is the result.
  return FIST;
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
15979 MVT VT = Op.getSimpleValueType();
15980 SDValue In = Op.getOperand(0);
15981 MVT SVT = In.getSimpleValueType();
15983 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
15985 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
15986 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                  In, DAG.getUNDEF(SVT)));
}
15990 /// The only differences between FABS and FNEG are the mask and the logic op.
15991 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
15992 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
15993 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
15994 "Wrong opcode for lowering FABS or FNEG.");
15996 bool IsFABS = (Op.getOpcode() == ISD::FABS);
15998 // If this is a FABS and it has an FNEG user, bail out to fold the combination
15999 // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->uses())
      if (User->getOpcode() == ISD::FNEG)
        return Op;

  SDLoc dl(Op);
16006 MVT VT = Op.getSimpleValueType();
16008 bool IsF128 = (VT == MVT::f128);
16010 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16011 // decide if we should generate a 16-byte constant mask when we only need 4 or
16012 // 8 bytes for the scalar case.
  MVT LogicVT;
  MVT EltVT;

  if (VT.isVector()) {
    LogicVT = VT;
    EltVT = VT.getVectorElementType();
  } else if (IsF128) {
    // SSE instructions are used for optimized f128 logical operations.
    LogicVT = MVT::f128;
    EltVT = MVT::f32;
  } else {
    // There are no scalar bitwise logical SSE/AVX instructions, so we
    // generate a 16-byte vector constant and logic op even for the scalar case.
    // Using a 16-byte mask allows folding the load of the mask with
    // the logic op, so it can save (~4 bytes) on code size.
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
    EltVT = VT;
  }
16033 unsigned EltBits = EltVT.getSizeInBits();
16034 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
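  // e.g. for f32: FABS masks with 0x7FFFFFFF (AND clears the sign bit) and
  // FNEG masks with 0x80000000 (XOR flips it).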
  APInt MaskElt =
      IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16037 const fltSemantics &Sem =
16038 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16039 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16040 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16042 SDValue Op0 = Op.getOperand(0);
16043 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  unsigned LogicOp =
      IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16046 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16048 if (VT.isVector() || IsF128)
16049 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16051 // For the scalar case extend to a 128-bit vector, perform the logic op,
16052 // and extract the scalar result back out.
16053 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16054 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16055 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getIntPtrConstant(0, dl));
}
16059 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16060 SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  SDLoc dl(Op);
16064 // If the sign operand is smaller, extend it first.
16065 MVT VT = Op.getSimpleValueType();
16066 if (Sign.getSimpleValueType().bitsLT(VT))
16067 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16069 // And if it is bigger, shrink it first.
16070 if (Sign.getSimpleValueType().bitsGT(VT))
16071 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16073 // At this point the operands and the result should have the same
16074 // type, and that won't be f80 since that is not custom lowered.
16075 bool IsF128 = (VT == MVT::f128);
16076 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16077 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16078 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16079 "Unexpected type in LowerFCOPYSIGN");
16081 MVT EltVT = VT.getScalarType();
16082 const fltSemantics &Sem =
16083 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16084 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16086 // Perform all scalar logic operations as 16-byte vectors because there are no
16087 // scalar FP logic instructions in SSE.
16088 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16089 // unnecessary splats, but we might miss load folding opportunities. Should
16090 // this decision be based on OptimizeForSize?
16091 bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16096 // The mask constants are automatically splatted for vector types.
16097 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16098 SDValue SignMask = DAG.getConstantFP(
16099 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16100 SDValue MagMask = DAG.getConstantFP(
16101 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16103 // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16106 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16108 // Next, clear the sign bit from the first operand (magnitude).
16109 // TODO: If we had general constant folding for FP logic ops, this check
16110 // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16113 APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }
16123 // OR the magnitude value with the sign bit.
16124 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16125 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                          DAG.getIntPtrConstant(0, dl));
}
16129 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
16132 MVT VT = Op.getSimpleValueType();
16134 MVT OpVT = N0.getSimpleValueType();
16135 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16136 "Unexpected type for FGETSIGN");
16138 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16139 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16140 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16141 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16142 Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
  return Res;
}
16147 // Check whether an OR'd tree is PTEST-able.
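// i.e. if every element of a vector %v is OR'd together and compared against
// zero, the whole tree can become a single PTEST %v, %v, whose ZF is set iff
// %v == 0.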
16148 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16149 SelectionDAG &DAG) {
16150 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
  if (!Subtarget.hasSSE41())
    return SDValue();
  if (!Op->hasOneUse())
    return SDValue();
  SDNode *N = Op.getNode();
  SDLoc DL(N);
16161 SmallVector<SDValue, 8> Opnds;
16162 DenseMap<SDValue, unsigned> VecInMap;
16163 SmallVector<SDValue, 8> VecIns;
16164 EVT VT = MVT::Other;
  // Recognize a special case where a vector is cast into a wide integer to
  // test all 0s.
16168 Opnds.push_back(N->getOperand(0));
16169 Opnds.push_back(N->getOperand(1));
16171 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16172 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16173 // BFS traverse all OR'd operands.
16174 if (I->getOpcode() == ISD::OR) {
16175 Opnds.push_back(I->getOperand(0));
16176 Opnds.push_back(I->getOperand(1));
16177 // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }
    // Quit if this is not an EXTRACT_VECTOR_ELT.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
    // Quit if the extract index is not a constant.
16187 SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();
16191 SDValue ExtractedFromVec = I->getOperand(0);
16192 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16193 if (M == VecInMap.end()) {
16194 VT = ExtractedFromVec.getValueType();
16195 // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
16198 // Quit if not the same type.
16199 if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
16202 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16203 VecIns.push_back(ExtractedFromVec);
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }
16208 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16209 "Not extracted from 128-/256-bit vector.");
16211 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16213 for (DenseMap<SDValue, unsigned>::const_iterator
16214 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16215 // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
  }
16220 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16222 // Cast all vectors into TestVT for PTEST.
16223 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16224 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16226 // If more than one full vector is evaluated, OR them first before PTEST.
16227 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16228 // Each iteration will OR 2 nodes and append the result until there is only
16229 // 1 node left, i.e. the final OR'd value of all vectors.
16230 SDValue LHS = VecIns[Slot];
16231 SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }
  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// \brief Return true if \c Op has a use that doesn't just read flags.
16239 static bool hasNonFlagsUse(SDValue Op) {
  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
       ++UI) {
16242 SDNode *User = *UI;
16243 unsigned UOpNo = UI.getOperandNo();
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate.
      UOpNo = User->use_begin().getOperandNo();
      User = *User->use_begin();
    }
16250 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
      return true;
  }
  return false;
}
16257 // Emit KTEST instruction for bit vectors on AVX-512
16258 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16259 const X86Subtarget &Subtarget) {
16260 if (Op.getOpcode() == ISD::BITCAST) {
16261 auto hasKTEST = [&](MVT VT) {
16262 unsigned SizeInBits = VT.getSizeInBits();
16263 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
             (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
    };
16266 SDValue Op0 = Op.getOperand(0);
16267 MVT Op0VT = Op0.getValueType().getSimpleVT();
    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
        hasKTEST(Op0VT))
      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
  }
  return SDValue();
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
16277 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16278 SelectionDAG &DAG) const {
16279 if (Op.getValueType() == MVT::i1) {
16280 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16281 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
                       DAG.getConstant(0, dl, MVT::i8));
  }
16284 // CF and OF aren't always set the way we want. Determine which
16285 // of these we need.
16286 bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
16294 case X86::COND_G: case X86::COND_GE:
16295 case X86::COND_L: case X86::COND_LE:
16296 case X86::COND_O: case X86::COND_NO: {
16297 // Check if we really need to set the
16298 // Overflow flag. If NoSignedWrap is present
16299 // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SHL:
      if (Op.getNode()->getFlags().hasNoSignedWrap())
        break;
      LLVM_FALLTHROUGH;
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
16314 // See if we can use the EFLAGS value from the operand instead of
16315 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16316 // we prove that the arithmetic won't overflow, we can't use OF or CF.
16317 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16318 // Emit KTEST for bit vectors
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;
16321 // Emit a CMP with 0, which is the TEST pattern.
16322 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
16325 unsigned Opcode = 0;
16326 unsigned NumOperands = 0;
16328 // Truncate operations may prevent the merge of the SETCC instruction
16329 // and the arithmetic instruction before it. Attempt to truncate the operands
16330 // of the arithmetic instruction and use a reduced bit-width instruction.
16331 bool NeedTruncation = false;
16332 SDValue ArithOp = Op;
16333 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16334 SDValue Arith = Op->getOperand(0);
16335 // Both the trunc and the arithmetic op need to have one user each.
16336 if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
      default: break;
      case ISD::ADD: case ISD::SUB: case ISD::AND: case ISD::OR:
      case ISD::XOR: {
        NeedTruncation = true;
        ArithOp = Arith;
      }
      }
  }
16350 // Sometimes flags can be set either with an AND or with an SRL/SHL
  // instruction. The SRL/SHL variant should be preferred for masks longer than
  // this number of bits.
16353 const int ShiftToAndMaxMaskWidth = 32;
16354 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16356 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16357 // which may be the result of a CAST. We use the variable 'Op', which is the
16358 // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
16361 // Due to an isel shortcoming, be conservative if this add is likely to be
16362 // selected as part of a load-modify-store instruction. When the root node
16363 // in a match is a store, isel doesn't know how to remap non-chain non-flag
16364 // uses of other nodes in the match, such as the ADD in this case. This
16365 // leads to the ADD being left around and reselected, with the result being
16366 // two adds in the output. Alas, even if none our users are stores, that
16367 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
16368 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // trouble.
16371 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16372 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16373 if (UI->getOpcode() != ISD::CopyToReg &&
16374 UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;
16378 if (ConstantSDNode *C =
16379 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16380 // An add of one will be selected as an INC.
16381 if (C->isOne() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }
16387 // An add of negative one (subtract of one) will be selected as a DEC.
16388 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }
16395 // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;

  case ISD::SHL:
  case ISD::SRL:
16401 // If we have a constant logical shift that's only used in a comparison
16402 // against zero turn it into an equivalent AND. This allows turning it into
16403 // a TEST instruction later.
16404 if (ZeroCheck && Op->hasOneUse() &&
16405 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16406 EVT VT = Op.getValueType();
16407 unsigned BitWidth = VT.getSizeInBits();
16408 unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
16411 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16412 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16413 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
        break;
16416 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                       DAG.getConstant(Mask, dl, VT));
      }
      break;

  case ISD::AND:
16422 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16423 // because a TEST instruction will be better. However, AND should be
16424 // preferred if the instruction can be combined into ANDN.
16425 if (!hasNonFlagsUse(Op)) {
16426 SDValue Op0 = ArithOp->getOperand(0);
16427 SDValue Op1 = ArithOp->getOperand(1);
16428 EVT VT = ArithOp.getValueType();
16429 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16430 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16431 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16433 // If we cannot select an ANDN instruction, check if we can replace
16434 // AND+IMM64 with a shift before giving up. This is possible for masks
16435 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
      if (!isProperAndn) {
        if (!ZeroCheck)
          break;
16440 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
        auto *CN = dyn_cast<ConstantSDNode>(Op1);
        if (!CN)
          break;
16445 const APInt &Mask = CN->getAPIntValue();
16446 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16447 break; // Prefer TEST instruction.
16449 unsigned BitWidth = Mask.getBitWidth();
16450 unsigned LeadingOnes = Mask.countLeadingOnes();
16451 unsigned TrailingZeros = Mask.countTrailingZeros();
16453 if (LeadingOnes + TrailingZeros == BitWidth) {
16454 assert(TrailingZeros < VT.getSizeInBits() &&
16455 "Shift amount should be less than the type width");
16456 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16457 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
          break;
        }
16462 unsigned LeadingZeros = Mask.countLeadingZeros();
16463 unsigned TrailingOnes = Mask.countTrailingOnes();
16465 if (LeadingZeros + TrailingOnes == BitWidth) {
16466 assert(LeadingZeros < VT.getSizeInBits() &&
16467 "Shift amount should be less than the type width");
16468 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16469 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
          break;
        }
      }
    }
    LLVM_FALLTHROUGH;
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
16481 // Due to the ISEL shortcoming noted above, be conservative if this op is
16482 // likely to be selected as part of a load-modify-store instruction.
16483 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16484 UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;
16488 // Otherwise use a regular EFLAGS-setting instruction.
16489 switch (ArithOp.getOpcode()) {
16490 default: llvm_unreachable("unexpected operator!");
16491 case ISD::SUB: Opcode = X86ISD::SUB; break;
16492 case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
16495 if (!NeedTruncation && ZeroCheck) {
        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD: case X86ISD::SUB: case X86ISD::INC: case X86ISD::DEC:
  case X86ISD::OR:  case X86ISD::AND: case X86ISD::XOR:
    // These nodes already set EFLAGS; use their flag result directly.
    return SDValue(Op.getNode(), 1);

  default:
  default_case:
    break;
  }
  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
16521 if (NeedTruncation) {
16522 EVT VT = Op.getValueType();
16523 SDValue WideVal = Op->getOperand(0);
16524 EVT WideVT = WideVal.getValueType();
16525 unsigned ConvertedOp = 0;
16526 // Use a target machine opcode to prevent further DAGCombine
16527 // optimizations that may separate the arithmetic operations
16528 // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
16531 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16532 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16533 case ISD::AND: ConvertedOp = X86ISD::AND; break;
16534 case ISD::OR: ConvertedOp = X86ISD::OR; break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
16539 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16540 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16541 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16542 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0) {
16549 // Emit KTEST for bit vectors
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;
16553 // Emit a CMP with 0, which is the TEST pattern.
16554 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
16557 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16558 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16560 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16561 DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
16567 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16568 const SDLoc &dl, SelectionDAG &DAG) const {
16569 if (isNullConstant(Op1))
16570 return EmitTest(Op0, X86CC, dl, DAG);
16572 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16573 "Unexpected comparison operation for MVT::i1 operands");
16575 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16576 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16577 // Only promote the compare up to I32 if it is a 16 bit operation
16578 // with an immediate. 16 bit immediates are to be avoided.
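    // (An imm16 needs a 0x66 operand-size prefix, and such length-changing
    // prefixes can stall the decoders on many x86 microarchitectures.)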
16579 if ((Op0.getValueType() == MVT::i16 &&
16580 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16581 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16582 !Subtarget.isAtom()) {
16583 unsigned ExtendOp =
16584 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16585 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
    }
16588 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16589 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
                              Op0, Op1);
    return SDValue(Sub.getNode(), 1);
  }
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
16597 /// Convert a comparison if required by the subtarget.
16598 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16599 SelectionDAG &DAG) const {
16600 // If the subtarget does not support the FUCOMI instruction, floating-point
16601 // comparisons have to be converted.
16602 if (Subtarget.hasCMov() ||
16603 Cmp.getOpcode() != X86ISD::CMP ||
16604 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;
16608 // The instruction selector will select an FUCOM instruction instead of
16609 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16610 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc dl(Cmp);
16613 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16614 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16615 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16616 DAG.getConstant(8, dl, MVT::i8));
16617 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16619 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16620 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
16624 /// Check if replacement of SQRT with RSQRT should be disabled.
16625 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16626 EVT VT = Op.getValueType();
16628 // We never want to use both SQRT and RSQRT instructions for the same input.
  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
    return false;

  if (VT.isVector())
16633 return Subtarget.hasFastVectorFSQRT();
  return Subtarget.hasFastScalarFSQRT();
}
16637 /// The minimum architected relative accuracy is 2^-12. We need one
16638 /// Newton-Raphson step to have a good float result (24 bits of precision).
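/// In essence, each step computes x1 = x0 * (1.5 - 0.5 * a * x0 * x0) from the
/// FRSQRT estimate x0 of input a; the step itself is built by generic code.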
16639 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16640 SelectionDAG &DAG, int Enabled,
16641 int &RefinementSteps,
16642 bool &UseOneConstNR,
16643 bool Reciprocal) const {
16644 EVT VT = Op.getValueType();
16646 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16647 // TODO: Add support for AVX512 (v16f32).
16648 // It is likely not profitable to do this for f64 because a double-precision
16649 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16650 // instructions: convert to single, rsqrtss, convert back to double, refine
16651 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16652 // along with FMA, this could be a throughput win.
16653 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16654 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16655 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16656 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16657 RefinementSteps = 1;
16659 UseOneConstNR = false;
    return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
16665 /// The minimum architected relative accuracy is 2^-12. We need one
16666 /// Newton-Raphson step to have a good float result (24 bits of precision).
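/// In essence, each step computes x1 = x0 * (2.0 - a * x0) from the FRCP
/// estimate x0 of input a; the step itself is built by generic code.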
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
16670 EVT VT = Op.getValueType();
16672 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16673 // TODO: Add support for AVX512 (v16f32).
16674 // It is likely not profitable to do this for f64 because a double-precision
16675 // reciprocal estimate with refinement on x86 prior to FMA requires
16676 // 15 instructions: convert to single, rcpss, convert back to double, refine
16677 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16678 // along with FMA, this could be a throughput win.
16680 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16681 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16682 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16683 // Enable estimate codegen with 1 refinement step for vector division.
16684 // Scalar division estimates are disabled because they break too much
16685 // real-world code. These defaults are intended to match GCC behavior.
    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
      return SDValue();
16689 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16690 RefinementSteps = 1;
    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
16697 /// If we have at least two divisions that use the same divisor, convert to
16698 /// multiplication by a reciprocal. This may need to be adjusted for a given
16699 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16700 /// This is because we still need one division to calculate the reciprocal and
16701 /// then we need two multiplies by that reciprocal as replacements for the
16702 /// original divisions.
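/// For example, a/d + b/d becomes r = 1.0/d; a*r + b*r: one divide and two
/// multiplies instead of two divides, which pays off when a divide costs at
/// least twice a multiply.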
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
  return 2;
}
16707 /// Helper for creating a X86ISD::SETCC node.
16708 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16709 SelectionDAG &DAG) {
16710 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
16714 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16715 /// according to equal/not-equal condition code \p CC.
16716 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16717 const SDLoc &dl, SelectionDAG &DAG) {
16718 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16719 // instruction. Since the shift amount is in-range-or-undefined, we know
16720 // that doing a bittest on the i32 value is ok. We extend to i32 because
16721 // the encoding for the i16 version is larger than the i32 version.
16722 // Also promote i16 to i32 for performance / code size reason.
16723 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16724 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16726 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16727 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16728 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16729 // known to be zero.
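  // e.g. with bit 5 of BitNo known zero, BitNo mod 64 == BitNo mod 32, so the
  // 32-bit BT tests the same bit as the 64-bit form at a smaller encoding.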
16730 if (Src.getValueType() == MVT::i64 &&
16731 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16732 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16734 // If the operand types disagree, extend the shift amount to match. Since
16735 // BT ignores high bits (like shifts) we can use anyextend.
16736 if (Src.getValueType() != BitNo.getValueType())
16737 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16739 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16740 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
  return getSETCC(Cond, BT, dl, DAG);
}
16744 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16745 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16746 const SDLoc &dl, SelectionDAG &DAG) {
16747 SDValue Op0 = And.getOperand(0);
16748 SDValue Op1 = And.getOperand(1);
16749 if (Op0.getOpcode() == ISD::TRUNCATE)
16750 Op0 = Op0.getOperand(0);
16751 if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
16755 if (Op1.getOpcode() == ISD::SHL)
16756 std::swap(Op0, Op1);
16757 if (Op0.getOpcode() == ISD::SHL) {
16758 if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
16761 unsigned BitWidth = Op0.getValueSizeInBits();
16762 unsigned AndBitWidth = And.getValueSizeInBits();
16763 if (BitWidth > AndBitWidth) {
        KnownBits Known;
        DAG.computeKnownBits(Op0, Known);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      LHS = Op1;
      RHS = Op0.getOperand(1);
    }
16772 } else if (Op1.getOpcode() == ISD::Constant) {
16773 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16774 uint64_t AndRHSVal = AndRHS->getZExtValue();
16775 SDValue AndLHS = Op0;
16777 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16778 LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }
16782 // Use BT if the immediate can't be encoded in a TEST instruction.
16783 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
    }
  }

  if (LHS.getNode())
    return getBitTestCondition(LHS, RHS, CC, dl, DAG);

  return SDValue();
}
16795 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16796 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16797 const SDLoc &dl, SelectionDAG &DAG) {
16799 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16800 "Expected TRUNCATE to i1 node");
  if (Op.getOperand(0).getOpcode() != ISD::SRL)
    return SDValue();
16805 SDValue ShiftRight = Op.getOperand(0);
  return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
                             CC, dl, DAG);
}
16810 /// Result of 'and' or 'trunc to i1' is compared against zero.
16811 /// Change to a BT node if possible.
16812 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16813 const SDLoc &dl, SelectionDAG &DAG) const {
16814 if (Op.getOpcode() == ISD::AND)
16815 return LowerAndToBT(Op, CC, dl, DAG);
16816 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
    return LowerTruncateToBT(Op, CC, dl, DAG);

  return SDValue();
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                              SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;
  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
16837 switch (SetCCOpcode) {
16838 default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
16849 case ISD::SETUO: SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
16852 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16853 case ISD::SETUGE: SSECC = 5; break;
16854 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16855 case ISD::SETUGT: SSECC = 6; break;
16856 case ISD::SETO: SSECC = 7; break;
  case ISD::SETUEQ:
  case ISD::SETONE: SSECC = 8; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
16868 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
16871 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16872 "Unsupported value type for operation");
16874 unsigned NumElems = VT.getVectorNumElements();
16876 SDValue CC = Op.getOperand(2);
16878 // Extract the LHS vectors
16879 SDValue LHS = Op.getOperand(0);
16880 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16881 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16883 // Extract the RHS vectors
16884 SDValue RHS = Op.getOperand(1);
16885 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16886 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16888 // Issue the operation on the smaller types and concatenate the result back
16889 MVT EltVT = VT.getVectorElementType();
16890 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16891 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16892 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16893 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16894 }
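// NOTE (added annotation, not in the original source): e.g. on an AVX1-only
// target a v8i32 compare becomes
//   concat_vectors(setcc(lhs.lo128, rhs.lo128), setcc(lhs.hi128, rhs.hi128))
// because 256-bit integer compares require AVX2.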
16896 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16897 SDValue Op0 = Op.getOperand(0);
16898 SDValue Op1 = Op.getOperand(1);
16899 SDValue CC = Op.getOperand(2);
16900 MVT VT = Op.getSimpleValueType();
16901 SDLoc dl(Op);
16903 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16904 "Unexpected type for boolean compare operation");
16905 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16906 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16907 DAG.getConstant(-1, dl, VT));
16908 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16909 DAG.getConstant(-1, dl, VT));
16910 switch (SetCCOpcode) {
16911 default: llvm_unreachable("Unexpected SETCC condition");
16912 case ISD::SETEQ:
16913 // (x == y) -> ~(x ^ y)
16914 return DAG.getNode(ISD::XOR, dl, VT,
16915 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16916 DAG.getConstant(-1, dl, VT));
16917 case ISD::SETNE:
16918 // (x != y) -> (x ^ y)
16919 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16920 case ISD::SETUGT:
16921 case ISD::SETGT:
16922 // (x > y) -> (x & ~y)
16923 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16924 case ISD::SETULT:
16925 case ISD::SETLT:
16926 // (x < y) -> (~x & y)
16927 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16928 case ISD::SETULE:
16929 case ISD::SETLE:
16930 // (x <= y) -> (~x | y)
16931 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16932 case ISD::SETUGE:
16933 case ISD::SETGE:
16934 // (x >= y) -> (x | ~y)
16935 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
16936 }
16937 }
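// NOTE (added annotation, not in the original source): with i1 lanes these
// identities can be checked by truth table; e.g. for (x > y) -> (x & ~y) the
// only combination producing 1 is x=1, y=0, which is exactly 'greater-than'
// on a one-bit unsigned value.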
16939 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16941 SDValue Op0 = Op.getOperand(0);
16942 SDValue Op1 = Op.getOperand(1);
16943 SDValue CC = Op.getOperand(2);
16944 MVT VT = Op.getSimpleValueType();
16945 SDLoc dl(Op);
16947 assert(VT.getVectorElementType() == MVT::i1 &&
16948 "Cannot set masked compare for this operation");
16950 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16951 unsigned Opc = 0;
16952 bool Unsigned = false;
16953 bool Swap = false;
16954 unsigned SSECC = 0;
16955 switch (SetCCOpcode) {
16956 default: llvm_unreachable("Unexpected SETCC condition");
16957 case ISD::SETNE: SSECC = 4; break;
16958 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16959 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16960 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16961 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
16962 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
16963 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
16964 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
16965 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
16966 case ISD::SETLE: SSECC = 2; break;
16967 }
16969 if (Swap)
16970 std::swap(Op0, Op1);
16971 if (Opc)
16972 return DAG.getNode(Opc, dl, VT, Op0, Op1);
16973 Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
16974 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16975 DAG.getConstant(SSECC, dl, MVT::i8));
16976 }
16978 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
16979 /// operand \p Op1. If non-trivial (for example because it's not constant)
16980 /// return an empty value.
16981 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
16982 SelectionDAG &DAG) {
16983 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
16984 if (!BV)
16985 return SDValue();
16987 MVT VT = Op1.getSimpleValueType();
16988 MVT EVT = VT.getVectorElementType();
16989 unsigned n = VT.getVectorNumElements();
16990 SmallVector<SDValue, 8> ULTOp1;
16992 for (unsigned i = 0; i < n; ++i) {
16993 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
16994 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
16995 return SDValue();
16997 // Avoid underflow.
16998 APInt Val = Elt->getAPIntValue();
16999 if (Val == 0)
17000 return SDValue();
17002 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17003 }
17005 return DAG.getBuildVector(VT, dl, ULTOp1);
17006 }
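// NOTE (added annotation, not in the original source): this uses the unsigned
// identity 'x <u C  <=>  x <=u C-1' for C != 0 (the underflow check above),
// e.g. 'x <u <5,5,5,5>' becomes 'x <=u <4,4,4,4>', a form the PSUBUS path
// below can consume without swapping operands.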
17008 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17009 SelectionDAG &DAG) {
17010 SDValue Op0 = Op.getOperand(0);
17011 SDValue Op1 = Op.getOperand(1);
17012 SDValue CC = Op.getOperand(2);
17013 MVT VT = Op.getSimpleValueType();
17014 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17015 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17016 SDLoc dl(Op);
17018 if (isFP) {
17019 #ifndef NDEBUG
17020 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17021 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17022 #endif
17024 unsigned Opc;
17025 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17026 assert(VT.getVectorNumElements() <= 16);
17027 Opc = X86ISD::CMPM;
17028 } else {
17029 Opc = X86ISD::CMPP;
17030 // The SSE/AVX packed FP comparison nodes are defined with a
17031 // floating-point vector result that matches the operand type. This allows
17032 // them to work with an SSE1 target (integer vector types are not legal).
17033 VT = Op0.getSimpleValueType();
17034 }
17036 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17037 // emit two comparisons and a logic op to tie them together.
17038 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
17039 // available.
17040 SDValue Cmp;
17041 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
17042 if (SSECC == 8) {
17043 // LLVM predicate is SETUEQ or SETONE.
17044 unsigned CC0, CC1;
17045 unsigned CombineOpc;
17046 if (SetCCOpcode == ISD::SETUEQ) {
17047 CC0 = 3; // UNORD
17048 CC1 = 0; // EQ
17049 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17050 static_cast<unsigned>(ISD::OR);
17051 } else {
17052 assert(SetCCOpcode == ISD::SETONE);
17053 CC0 = 7; // ORD
17054 CC1 = 4; // NEQ
17055 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17056 static_cast<unsigned>(ISD::AND);
17057 }
17059 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17060 DAG.getConstant(CC0, dl, MVT::i8));
17061 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17062 DAG.getConstant(CC1, dl, MVT::i8));
17063 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17064 } else {
17065 // Handle all other FP comparisons here.
17066 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17067 DAG.getConstant(SSECC, dl, MVT::i8));
17068 }
17070 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17071 // result type of SETCC. The bitcast is expected to be optimized away
17072 // during combining/isel.
17073 if (Opc == X86ISD::CMPP)
17074 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17076 return Cmp;
17077 }
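// NOTE (added annotation, not in the original source): e.g. SETUEQ on v4f32
// is emitted as
//   cmpunordps c0, a, b   ; true when either input is NaN
//   cmpeqps    c1, a, b   ; ordered equal
//   orps       c0, c1     ; 'unordered or equal'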
17079 MVT VTOp0 = Op0.getSimpleValueType();
17080 assert(VTOp0 == Op1.getSimpleValueType() &&
17081 "Expected operands with same type!");
17082 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17083 "Invalid number of packed elements for source and destination!");
17085 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17086 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17087 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17088 // legalizer first checks if the first operand in input to the setcc has
17089 // a legal type. If so, then it promotes the return type to that same type.
17090 // Otherwise, the return type is promoted to the 'next legal type' which,
17091 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17093 // We reach this code only if the following two conditions are met:
17094 // 1. Both return type and operand type have been promoted to wider types
17095 // by the type legalizer.
17096 // 2. The original operand type has been promoted to a 256-bit vector.
17098 // Note that condition 2. only applies for AVX targets.
17099 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
17100 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17101 }
17103 // The non-AVX512 code below works under the assumption that source and
17104 // destination types are the same.
17105 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17106 "Value types for source and destination must be the same!");
17108 // Break 256-bit integer vector compare into smaller ones.
17109 if (VT.is256BitVector() && !Subtarget.hasInt256())
17110 return Lower256IntVSETCC(Op, DAG);
17112 // Operands are boolean (vectors of i1)
17113 MVT OpVT = Op1.getSimpleValueType();
17114 if (OpVT.getVectorElementType() == MVT::i1)
17115 return LowerBoolVSETCC_AVX512(Op, DAG);
17117 // The result is boolean, but operands are int/float
17118 if (VT.getVectorElementType() == MVT::i1) {
17119 // In the AVX-512 architecture setcc returns a mask with i1 elements,
17120 // but there is no compare instruction for i8 and i16 elements in KNL.
17121 // In this case use the SSE compare.
17122 bool UseAVX512Inst =
17123 (OpVT.is512BitVector() ||
17124 OpVT.getScalarSizeInBits() >= 32 ||
17125 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17127 if (UseAVX512Inst)
17128 return LowerIntVSETCC_AVX512(Op, DAG);
17130 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17131 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17132 }
17134 // Lower using XOP integer comparisons.
17135 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17136 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17137 // Translate compare code to XOP PCOM compare mode.
17138 unsigned CmpMode = 0;
17139 switch (SetCCOpcode) {
17140 default: llvm_unreachable("Unexpected SETCC condition");
17141 case ISD::SETULT:
17142 case ISD::SETLT: CmpMode = 0x00; break;
17143 case ISD::SETULE:
17144 case ISD::SETLE: CmpMode = 0x01; break;
17145 case ISD::SETUGT:
17146 case ISD::SETGT: CmpMode = 0x02; break;
17147 case ISD::SETUGE:
17148 case ISD::SETGE: CmpMode = 0x03; break;
17149 case ISD::SETEQ: CmpMode = 0x04; break;
17150 case ISD::SETNE: CmpMode = 0x05; break;
17151 }
17153 // Are we comparing unsigned or signed integers?
17154 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
17155 ? X86ISD::VPCOMU : X86ISD::VPCOM;
17157 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17158 DAG.getConstant(CmpMode, dl, MVT::i8));
17159 }
17161 // We are handling one of the integer comparisons here. Since SSE only has
17162 // GT and EQ comparisons for integer, swapping operands and multiple
17163 // operations may be required for some comparisons.
17164 unsigned Opc;
17165 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
17166 bool Subus = false;
17168 switch (SetCCOpcode) {
17169 default: llvm_unreachable("Unexpected SETCC condition");
17170 case ISD::SETNE: Invert = true;
17171 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
17172 case ISD::SETLT: Swap = true;
17173 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
17174 case ISD::SETGE: Swap = true;
17175 case ISD::SETLE: Opc = X86ISD::PCMPGT;
17176 Invert = true; break;
17177 case ISD::SETULT: Swap = true;
17178 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
17179 FlipSigns = true; break;
17180 case ISD::SETUGE: Swap = true;
17181 case ISD::SETULE: Opc = X86ISD::PCMPGT;
17182 FlipSigns = true; Invert = true; break;
17185 // Special case: Use min/max operations for SETULE/SETUGE
17186 MVT VET = VT.getVectorElementType();
17187 bool hasMinMax =
17188 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
17189 || (Subtarget.hasSSE2() && (VET == MVT::i8));
17191 if (hasMinMax) {
17192 switch (SetCCOpcode) {
17193 default: break;
17194 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17195 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17196 }
17198 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
17199 }
17201 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17202 if (!MinMax && hasSubus) {
17203 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17204 // Op0 u<= Op1:
17205 // t = psubus Op0, Op1
17206 // pcmpeq t, <0..0>
17207 switch (SetCCOpcode) {
17208 default: break;
17209 case ISD::SETULT: {
17210 // If the comparison is against a constant we can turn this into a
17211 // setule. With psubus, setule does not require a swap. This is
17212 // beneficial because the constant in the register is no longer
17213 // destructed as the destination so it can be hoisted out of a loop.
17214 // Only do this pre-AVX since vpcmp* is no longer destructive.
17215 if (Subtarget.hasAVX())
17216 break;
17217 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17218 Op1 = ULEOp1;
17219 Subus = true; Invert = false; Swap = false;
17220 }
17221 break;
17222 }
17223 // Psubus is better than flip-sign because it requires no inversion.
17224 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17225 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17226 }
17228 if (Subus) {
17229 Opc = X86ISD::SUBUS;
17230 FlipSigns = false;
17231 Invert = false;
17232 }
17233 }
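// NOTE (added annotation, not in the original source): the PSUBUS path relies
// on the identity 'a <=u b  <=>  (a -usat b) == 0', since unsigned saturating
// subtraction is zero exactly when a <= b:
//   t = psubusb/psubusw a, b
//   r = pcmpeqb/pcmpeqw t, 0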
17234 if (Swap)
17235 std::swap(Op0, Op1);
17237 // Check that the operation in question is available (most are plain SSE2,
17238 // but PCMPGTQ and PCMPEQQ have different requirements).
17239 if (VT == MVT::v2i64) {
17240 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17241 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17243 // First cast everything to the right type.
17244 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17245 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17247 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17248 // bits of the inputs before performing those operations. The lower
17249 // compare is always unsigned.
17250 SDValue SB;
17251 if (FlipSigns) {
17252 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17253 } else {
17254 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17255 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17256 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17257 }
17258 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17259 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17261 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
17262 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17263 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17265 // Create masks for only the low parts/high parts of the 64 bit integers.
17266 static const int MaskHi[] = { 1, 1, 3, 3 };
17267 static const int MaskLo[] = { 0, 0, 2, 2 };
17268 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17269 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17270 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17272 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17273 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17275 if (Invert)
17276 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17278 return DAG.getBitcast(VT, Result);
17279 }
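// NOTE (added annotation, not in the original source): the identity used for
// 64-bit greater-than in terms of 32-bit operations is
//   a > b  <=>  (a.hi > b.hi) | ((a.hi == b.hi) & (a.lo >u b.lo))
// where the low halves were pre-biased by the sign mask above so that the
// signed PCMPGTD acts as an unsigned compare on them.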
17281 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17282 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17283 // pcmpeqd + pshufd + pand.
17284 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17286 // First cast everything to the right type.
17287 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17288 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17290 // Do the compare.
17291 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17293 // Make sure the lower and upper halves are both all-ones.
17294 static const int Mask[] = { 1, 0, 3, 2 };
17295 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17296 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17298 if (Invert)
17299 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17301 return DAG.getBitcast(VT, Result);
17302 }
17303 }
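// NOTE (added annotation, not in the original source): a v2i64 lane is equal
// only if both of its 32-bit halves are equal, so ANDing the PCMPEQD result
// with a copy of itself with halves swapped (pshufd mask {1,0,3,2}) produces
// the full 64-bit equality mask.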
17305 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17306 // bits of the inputs before performing those operations.
17307 if (FlipSigns) {
17308 MVT EltVT = VT.getVectorElementType();
17309 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17310 VT);
17311 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17312 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17313 }
17315 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17317 // If the logical-not of the result is required, perform that now.
17318 if (Invert)
17319 Result = DAG.getNOT(dl, Result, VT);
17321 if (MinMax)
17322 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17324 if (Subus)
17325 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17326 getZeroVector(VT, Subtarget, DAG, dl));
17328 return DAG.getBitcast(VT, Result);
17329 }
17331 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17333 MVT VT = Op.getSimpleValueType();
17335 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17337 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
17338 SDValue Op0 = Op.getOperand(0);
17339 SDValue Op1 = Op.getOperand(1);
17340 SDLoc dl(Op);
17341 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17343 // Optimize to BT if possible.
17344 // Lower (X & (1 << N)) == 0 to BT(X, N).
17345 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17346 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17347 // Lower (trunc (X >> N) to i1) to BT(X, N).
17348 if (Op0.hasOneUse() && isNullConstant(Op1) &&
17349 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17350 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
17351 if (VT == MVT::i1)
17352 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
17353 return NewSetCC;
17354 }
17355 }
17357 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
17358 // these.
17359 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17360 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17362 // If the input is a setcc, then reuse the input setcc or use a new one with
17363 // the inverted condition.
17364 if (Op0.getOpcode() == X86ISD::SETCC) {
17365 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17366 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17367 if (!Invert)
17368 return Op0;
17370 CCode = X86::GetOppositeBranchCondition(CCode);
17371 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17372 if (VT == MVT::i1)
17373 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17374 return SetCC;
17375 }
17376 }
17377 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17378 if (isOneConstant(Op1)) {
17379 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17380 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17381 }
17382 if (!isNullConstant(Op1)) {
17383 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17384 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17385 }
17386 }
17388 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17389 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17390 if (X86CC == X86::COND_INVALID)
17391 return SDValue();
17393 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17394 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17395 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17396 if (VT == MVT::i1)
17397 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17398 return SetCC;
17399 }
17401 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
17402 SDValue LHS = Op.getOperand(0);
17403 SDValue RHS = Op.getOperand(1);
17404 SDValue Carry = Op.getOperand(2);
17405 SDValue Cond = Op.getOperand(3);
17407 SDLoc DL(Op);
17408 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
17409 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17411 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
17412 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17413 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
17414 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17415 if (Op.getSimpleValueType() == MVT::i1)
17416 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17417 return SetCC;
17418 }
17420 /// Return true if opcode is a X86 logical comparison.
17421 static bool isX86LogicalCmp(SDValue Op) {
17422 unsigned Opc = Op.getOpcode();
17423 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17424 Opc == X86ISD::SAHF)
17425 return true;
17426 if (Op.getResNo() == 1 &&
17427 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17428 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
17429 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17430 Opc == X86ISD::XOR || Opc == X86ISD::AND))
17431 return true;
17433 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17434 return true;
17436 return false;
17437 }
17439 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17440 if (V.getOpcode() != ISD::TRUNCATE)
17441 return false;
17443 SDValue VOp0 = V.getOperand(0);
17444 unsigned InBits = VOp0.getValueSizeInBits();
17445 unsigned Bits = V.getValueSizeInBits();
17446 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17447 }
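// NOTE (added annotation, not in the original source): e.g. for
//   %t = trunc i32 %x to i8
// this returns true when the top 24 bits of %x are known zero, so a compare
// of %t can instead be performed on %x and the truncate looked through.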
17449 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17450 bool AddTest = true;
17451 SDValue Cond = Op.getOperand(0);
17452 SDValue Op1 = Op.getOperand(1);
17453 SDValue Op2 = Op.getOperand(2);
17454 SDLoc DL(Op);
17455 MVT VT = Op1.getSimpleValueType();
17456 SDValue CC;
17458 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17459 // are available or VBLENDV if AVX is available.
17460 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17461 if (Cond.getOpcode() == ISD::SETCC &&
17462 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17463 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17464 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17465 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17466 int SSECC = translateX86FSETCC(
17467 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17469 if (SSECC != 8) {
17470 if (Subtarget.hasAVX512()) {
17471 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
17472 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17473 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17474 DL, VT, Cmp, Op1, Op2);
17475 }
17477 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17478 DAG.getConstant(SSECC, DL, MVT::i8));
17480 // If we have AVX, we can use a variable vector select (VBLENDV) instead
17481 // of 3 logic instructions for size savings and potentially speed.
17482 // Unfortunately, there is no scalar form of VBLENDV.
17484 // If either operand is a constant, don't try this. We can expect to
17485 // optimize away at least one of the logic instructions later in that
17486 // case, so that sequence would be faster than a variable blend.
17488 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17489 // uses XMM0 as the selection register. That may need just as many
17490 // instructions as the AND/ANDN/OR sequence due to register moves, so
17491 // don't bother.
17493 if (Subtarget.hasAVX() &&
17494 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17496 // Convert to vectors, do a VSELECT, and convert back to scalar.
17497 // All of the conversions should be optimized away.
17499 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17500 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17501 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17502 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17504 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17505 VCmp = DAG.getBitcast(VCmpVT, VCmp);
17507 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
17509 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17510 VSel, DAG.getIntPtrConstant(0, DL));
17511 }
17512 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17513 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17514 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17515 }
17516 }
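// NOTE (added annotation, not in the original source): the three-op fallback
// is the classic branchless bit-select,
//   result = (cmp & op1) | (~cmp & op2)
// which is correct because FSETCC produces an all-ones or all-zeros mask.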
17518 // AVX512 fallback is to lower selects of scalar floats to masked moves.
17519 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
17520 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
17521 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
17522 }
17524 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17525 SDValue Op1Scalar;
17526 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17527 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17528 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17529 Op1Scalar = Op1.getOperand(0);
17530 SDValue Op2Scalar;
17531 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17532 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17533 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17534 Op2Scalar = Op2.getOperand(0);
17535 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17536 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
17537 Op1Scalar, Op2Scalar);
17538 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17539 return DAG.getBitcast(VT, newSelect);
17540 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17541 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17542 DAG.getIntPtrConstant(0, DL));
17543 }
17544 }
17546 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17547 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17548 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17549 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17550 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17551 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17552 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
17553 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17554 }
17556 if (Cond.getOpcode() == ISD::SETCC) {
17557 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17558 Cond = NewCond;
17559 // If the condition was updated, it's possible that the operands of the
17560 // select were also updated (for example, EmitTest has a RAUW). Refresh
17561 // the local references to the select operands in case they got stale.
17562 Op1 = Op.getOperand(1);
17563 Op2 = Op.getOperand(2);
17564 }
17565 }
17567 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17568 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17569 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17570 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17571 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17572 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
17573 if (Cond.getOpcode() == X86ISD::SETCC &&
17574 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17575 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17576 SDValue Cmp = Cond.getOperand(1);
17577 unsigned CondCode =
17578 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17580 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17581 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17582 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17584 SDValue CmpOp0 = Cmp.getOperand(0);
17585 // Apply further optimizations for special cases
17586 // (select (x != 0), -1, 0) -> neg & sbb
17587 // (select (x == 0), 0, -1) -> neg & sbb
17588 if (isNullConstant(Y) &&
17589 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17590 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17591 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17592 DAG.getConstant(0, DL,
17593 CmpOp0.getValueType()),
17594 CmpOp0);
17595 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17596 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17597 SDValue(Neg.getNode(), 1));
17598 return Res;
17599 }
17601 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17602 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17603 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17605 SDValue Res = // Res = 0 or -1.
17606 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17607 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17609 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17610 Res = DAG.getNOT(DL, Res, Res.getValueType());
17612 if (!isNullConstant(Op2))
17613 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17614 return Res;
17615 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17616 Cmp.getOperand(0).getOpcode() == ISD::AND &&
17617 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17618 SDValue CmpOp0 = Cmp.getOperand(0);
17619 SDValue Src1, Src2;
17620 // true if Op2 is XOR or OR operator and one of its operands
17621 // is equal to Op1
17622 // ( a , a op b) || ( b , a op b)
17623 auto isOrXorPattern = [&]() {
17624 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17625 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17626 Src1 =
17627 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17628 Src2 = Op1;
17629 return true;
17630 }
17631 return false;
17632 };
17634 if (isOrXorPattern()) {
17635 SDValue Neg;
17636 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17637 // we need mask of all zeros or ones with same size of the other
17638 // operands.
17639 if (CmpSz > VT.getSizeInBits())
17640 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17641 else if (CmpSz < VT.getSizeInBits())
17642 Neg = DAG.getNode(ISD::AND, DL, VT,
17643 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17644 DAG.getConstant(1, DL, VT));
17645 else
17646 Neg = CmpOp0;
17647 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17648 Neg); // -(and (x, 0x1))
17649 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17650 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
17651 }
17652 }
17653 }
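// NOTE (added annotation, not in the original source): e.g. for i32 without
// CMOV, 'select ((x & 1) == 0), y, (z ^ y)' becomes '((0 - (x & 1)) & z) ^ y':
// the mask is all-zeros when the bit is clear (yielding y) and all-ones when
// set (yielding z ^ y).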
17655 // Look past (and (setcc_carry (cmp ...)), 1).
17656 if (Cond.getOpcode() == ISD::AND &&
17657 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17658 isOneConstant(Cond.getOperand(1)))
17659 Cond = Cond.getOperand(0);
17661 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17662 // setting operand in place of the X86ISD::SETCC.
17663 unsigned CondOpcode = Cond.getOpcode();
17664 if (CondOpcode == X86ISD::SETCC ||
17665 CondOpcode == X86ISD::SETCC_CARRY) {
17666 CC = Cond.getOperand(0);
17668 SDValue Cmp = Cond.getOperand(1);
17669 unsigned Opc = Cmp.getOpcode();
17670 MVT VT = Op.getSimpleValueType();
17672 bool IllegalFPCMov = false;
17673 if (VT.isFloatingPoint() && !VT.isVector() &&
17674 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17675 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17677 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17678 Opc == X86ISD::BT) { // FIXME
17679 Cond = Cmp;
17680 AddTest = false;
17681 }
17682 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17683 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17684 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17685 Cond.getOperand(0).getValueType() != MVT::i8)) {
17686 SDValue LHS = Cond.getOperand(0);
17687 SDValue RHS = Cond.getOperand(1);
17688 unsigned X86Opcode;
17689 unsigned X86Cond;
17690 SDVTList VTs;
17691 switch (CondOpcode) {
17692 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17693 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17694 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17695 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17696 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17697 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17698 default: llvm_unreachable("unexpected overflowing operator");
17699 }
17700 if (CondOpcode == ISD::UMULO)
17701 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17702 MVT::i32);
17703 else
17704 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17706 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17708 if (CondOpcode == ISD::UMULO)
17709 Cond = X86Op.getValue(2);
17710 else
17711 Cond = X86Op.getValue(1);
17713 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17714 AddTest = false;
17715 }
17717 if (AddTest) {
17718 // Look past the truncate if the high bits are known zero.
17719 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17720 Cond = Cond.getOperand(0);
17722 // We know the result of AND is compared against zero. Try to match
17723 // it to BT.
17724 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17725 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17726 CC = NewSetCC.getOperand(0);
17727 Cond = NewSetCC.getOperand(1);
17728 AddTest = false;
17729 }
17730 }
17731 }
17733 if (AddTest) {
17734 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17735 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17736 }
17738 // a < b ? -1 : 0 -> RES = ~setcc_carry
17739 // a < b ? 0 : -1 -> RES = setcc_carry
17740 // a >= b ? -1 : 0 -> RES = setcc_carry
17741 // a >= b ? 0 : -1 -> RES = ~setcc_carry
17742 if (Cond.getOpcode() == X86ISD::SUB) {
17743 Cond = ConvertCmpIfNecessary(Cond, DAG);
17744 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17746 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17747 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17748 (isNullConstant(Op1) || isNullConstant(Op2))) {
17749 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17750 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17751 Cond);
17752 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17753 return DAG.getNOT(DL, Res, Res.getValueType());
17754 return Res;
17755 }
17756 }
17758 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17759 // widen the cmov and push the truncate through. This avoids introducing a new
17760 // branch during isel and doesn't add any extensions.
17761 if (Op.getValueType() == MVT::i8 &&
17762 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17763 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17764 if (T1.getValueType() == T2.getValueType() &&
17765 // Blacklist CopyFromReg to avoid partial register stalls.
17766 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17767 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17768 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17769 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17770 }
17771 }
17773 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17774 // condition is true.
17775 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17776 SDValue Ops[] = { Op2, Op1, CC, Cond };
17777 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17778 }
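// NOTE (added annotation, not in the original source): per the comment above,
// X86ISD::CMOV yields its second value operand when the condition holds,
// which is why Ops is ordered {Op2, Op1, CC, Cond}: Op1 is chosen on a true
// condition, Op2 otherwise.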
17780 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17781 const X86Subtarget &Subtarget,
17782 SelectionDAG &DAG) {
17783 MVT VT = Op->getSimpleValueType(0);
17784 SDValue In = Op->getOperand(0);
17785 MVT InVT = In.getSimpleValueType();
17786 MVT VTElt = VT.getVectorElementType();
17787 MVT InVTElt = InVT.getVectorElementType();
17788 SDLoc dl(Op);
17790 // SKX processor
17791 if ((InVTElt == MVT::i1) &&
17792 (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
17794 ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
17796 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17798 unsigned NumElts = VT.getVectorNumElements();
17800 if (VT.is512BitVector() && InVTElt != MVT::i1 &&
17801 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
17802 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17803 return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
17804 return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
17805 }
17807 if (InVTElt != MVT::i1)
17808 return SDValue();
17810 MVT ExtVT = VT;
17811 if (!VT.is512BitVector() && !Subtarget.hasVLX())
17812 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17814 SDValue V;
17815 if (Subtarget.hasDQI()) {
17816 V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
17817 assert(!VT.is512BitVector() && "Unexpected vector type");
17818 } else {
17819 SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
17820 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
17821 V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
17822 if (ExtVT == VT)
17823 return V;
17824 }
17826 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17827 }
17829 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17830 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17831 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17832 // MVT::v64i8 when BWI is not supported, but AVX512 is.
17833 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17834 const X86Subtarget &Subtarget,
17835 SelectionDAG &DAG) {
17836 SDValue In = Op->getOperand(0);
17837 MVT VT = Op->getSimpleValueType(0);
17838 MVT InVT = In.getSimpleValueType();
17839 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17841 MVT SVT = VT.getVectorElementType();
17842 MVT InSVT = InVT.getVectorElementType();
17843 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17845 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17846 return SDValue();
17847 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17848 return SDValue();
17849 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17850 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17851 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17852 return SDValue();
17854 SDLoc dl(Op);
17856 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17857 // For 512-bit vectors, we need 128-bits or 256-bits.
17858 if (VT.getSizeInBits() > 128) {
17859 // Input needs to be at least the same number of elements as output, and
17860 // at least 128-bits.
17861 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17862 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17863 InVT = In.getSimpleValueType();
17864 }
17865 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17866 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17868 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
17869 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
17870 // need to be handled here for 256/512-bit results.
17871 if (Subtarget.hasInt256()) {
17872 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
17873 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17874 X86ISD::VSEXT : X86ISD::VZEXT;
17875 return DAG.getNode(ExtOpc, dl, VT, In);
17876 }
17878 // We should only get here for sign extend.
17879 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17880 "Unexpected opcode!");
17882 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
17883 SDValue Curr = In;
17884 MVT CurrVT = InVT;
17886 // As SRAI is only available on i16/i32 types, we expand only up to i32
17887 // and handle i64 separately.
17888 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17889 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17890 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17891 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17892 Curr = DAG.getBitcast(CurrVT, Curr);
17893 }
17895 SDValue SignExt = Curr;
17896 if (CurrVT != InVT) {
17897 unsigned SignExtShift =
17898 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17899 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17900 DAG.getConstant(SignExtShift, dl, MVT::i8));
17901 }
17903 if (CurrVT == VT)
17904 return SignExt;
17906 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17907 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17908 DAG.getConstant(31, dl, MVT::i8));
17909 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17910 return DAG.getBitcast(VT, Ext);
17911 }
17913 return SDValue();
17914 }
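// NOTE (added annotation, not in the original source): the pre-SSE4.1
// sign-extend path for e.g. v16i8 -> v8i16 is
//   punpcklbw undef, x    ; source bytes land in the high byte of each word
//   psraw     $8, t       ; arithmetic shift replicates the sign bit
// i.e. unpack low lanes, then shift right by the width difference.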
17916 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17917 SelectionDAG &DAG) {
17918 MVT VT = Op->getSimpleValueType(0);
17919 SDValue In = Op->getOperand(0);
17920 MVT InVT = In.getSimpleValueType();
17921 SDLoc dl(Op);
17923 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17924 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
17926 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17927 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17928 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17929 return SDValue();
17931 if (Subtarget.hasInt256())
17932 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17934 // Optimize vectors in AVX mode
17935 // Sign extend v8i16 to v8i32 and
17936 //             v4i32 to v4i64
17937 //
17938 // Divide input vector into two parts
17939 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
17940 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
17941 // concat the vectors to original VT
17943 unsigned NumElems = InVT.getVectorNumElements();
17944 SDValue Undef = DAG.getUNDEF(InVT);
17946 SmallVector<int,8> ShufMask1(NumElems, -1);
17947 for (unsigned i = 0; i != NumElems/2; ++i)
17948 ShufMask1[i] = i;
17950 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
17952 SmallVector<int,8> ShufMask2(NumElems, -1);
17953 for (unsigned i = 0; i != NumElems/2; ++i)
17954 ShufMask2[i] = i + NumElems/2;
17956 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17958 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17959 VT.getVectorNumElements() / 2);
17961 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
17962 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
17964 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
17965 }
17967 // Lower truncating store. We need a special lowering to vXi1 vectors
17968 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
17969 SelectionDAG &DAG) {
17970 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
17971 SDLoc dl(St);
17972 EVT MemVT = St->getMemoryVT();
17973 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
17974 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
17975 "Expected truncstore of i1 vector");
17977 SDValue Op = St->getValue();
17978 MVT OpVT = Op.getValueType().getSimpleVT();
17979 unsigned NumElts = OpVT.getVectorNumElements();
17980 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17981 NumElts == 16) {
17982 // Truncate and store - everything is legal
17983 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
17984 if (MemVT.getSizeInBits() < 8)
17985 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
17986 DAG.getUNDEF(MVT::v8i1), Op,
17987 DAG.getIntPtrConstant(0, dl));
17988 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17989 St->getMemOperand());
17990 }
17992 // A subset, assume that we have only AVX-512F
17993 if (NumElts <= 8) {
17995 // Extend to 8-elts vector
17996 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
17997 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
17998 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18000 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18001 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18002 St->getMemOperand());
18003 }
18005 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18006 // Divide the vector into 2 parts and store each part separately
18007 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18008 DAG.getIntPtrConstant(0, dl));
18009 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18010 SDValue BasePtr = St->getBasePtr();
18011 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18012 St->getMemOperand());
18013 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18014 DAG.getIntPtrConstant(16, dl));
18015 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18017 SDValue BasePtrHi =
18018 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18019 DAG.getConstant(2, dl, BasePtr.getValueType()));
18021 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18022 BasePtrHi, St->getMemOperand());
18023 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18024 }
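// NOTE (added annotation, not in the original source): in the v32i8 split
// above each v16i1 half is stored as a 16-bit mask, so the high half lives at
// BasePtr + 2 bytes -- the offset constant used to form BasePtrHi.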
18026 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18027 const X86Subtarget &Subtarget,
18028 SelectionDAG &DAG) {
18029 SDLoc dl(Op);
18030 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18032 EVT MemVT = Ld->getMemoryVT();
18033 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18034 "Expected i1 vector load");
18035 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18036 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18037 MVT VT = Op.getValueType().getSimpleVT();
18038 unsigned NumElts = VT.getVectorNumElements();
18040 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18041 (Subtarget.hasDQI() && NumElts < 16) ||
18042 NumElts == 16) {
18043 // Load and extend - everything is legal
18044 if (NumElts < 8) {
18045 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18046 Ld->getBasePtr(),
18047 Ld->getMemOperand());
18048 // Replace chain users with the new chain.
18049 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18050 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18051 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18052 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18054 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18055 DAG.getIntPtrConstant(0, dl));
18056 }
18057 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18058 Ld->getBasePtr(),
18059 Ld->getMemOperand());
18060 // Replace chain users with the new chain.
18061 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18062 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18064 // Finally, do a normal sign-extend to the desired register.
18065 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18066 }
18068 if (NumElts <= 8) {
18069 // A subset, assume that we have only AVX-512F
18070 unsigned NumBitsToLoad = 8;
18071 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18072 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18073 Ld->getBasePtr(),
18074 Ld->getMemOperand());
18075 // Replace chain users with the new chain.
18076 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18077 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18079 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18080 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18082 if (NumElts == 8)
18083 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18085 // We should take care of v4i1 and v2i1 here.
18087 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18088 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18089 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18090 DAG.getIntPtrConstant(0, dl));
18091 }
18093 assert(VT == MVT::v32i8 && "Unexpected extload type");
18095 SmallVector<SDValue, 2> Chains;
18097 SDValue BasePtr = Ld->getBasePtr();
18098 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18099 BasePtr,
18100 Ld->getMemOperand());
18101 Chains.push_back(LoadLo.getValue(1));
18103 SDValue BasePtrHi =
18104 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18105 DAG.getConstant(2, dl, BasePtr.getValueType()));
18107 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18108 BasePtrHi,
18109 Ld->getMemOperand());
18110 Chains.push_back(LoadHi.getValue(1));
18111 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18112 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18114 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18115 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18116 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18117 }
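// NOTE (added annotation, not in the original source): this mirrors the
// truncating-store split above -- each v16i1 half is a 16-bit mask in memory,
// hence the +2 byte offset for the high part before both halves are extended
// to v16i8 and concatenated.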
18119 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18120 // may emit an illegal shuffle but the expansion is still better than scalar
18121 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18122 // we'll emit a shuffle and an arithmetic shift.
18123 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18124 // TODO: It is possible to support ZExt by zeroing the undef values during
18125 // the shuffle phase or after the shuffle.
18126 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18127 SelectionDAG &DAG) {
18128 MVT RegVT = Op.getSimpleValueType();
18129 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18130 assert(RegVT.isInteger() &&
18131 "We only custom lower integer vector sext loads.");
18133 // Nothing useful we can do without SSE2 shuffles.
18134 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18136 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18137 SDLoc dl(Ld);
18138 EVT MemVT = Ld->getMemoryVT();
18139 if (MemVT.getScalarType() == MVT::i1)
18140 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18142 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18143 unsigned RegSz = RegVT.getSizeInBits();
18145 ISD::LoadExtType Ext = Ld->getExtensionType();
18147 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18148 && "Only anyext and sext are currently implemented.");
18149 assert(MemVT != RegVT && "Cannot extend to the same type");
18150 assert(MemVT.isVector() && "Must load a vector from memory");
18152 unsigned NumElems = RegVT.getVectorNumElements();
18153 unsigned MemSz = MemVT.getSizeInBits();
18154 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18156 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18157 // The only way in which we have a legal 256-bit vector result but not the
18158 // integer 256-bit operations needed to directly lower a sextload is if we
18159 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18160 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18161 // correctly legalized. We do this late to allow the canonical form of
18162 // sextload to persist throughout the rest of the DAG combiner -- it wants
18163 // to fold together any extensions it can, and so will fuse a sign_extend
18164 // of an sextload into a sextload targeting a wider value.
18165 SDValue Load;
18166 if (MemSz == 128) {
18167 // Just switch this to a normal load.
18168 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18169 "it must be a legal 128-bit vector "
18171 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18172 Ld->getPointerInfo(), Ld->getAlignment(),
18173 Ld->getMemOperand()->getFlags());
18175 assert(MemSz < 128 &&
18176 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18177 // Do an sext load to a 128-bit vector type. We want to use the same
18178 // number of elements, but elements half as wide. This will end up being
18179 // recursively lowered by this routine, but will succeed as we definitely
18180 // have all the necessary features if we're using AVX1.
18181 EVT HalfEltVT =
18182 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18183 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18184 Load =
18185 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18186 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18187 Ld->getMemOperand()->getFlags());
18188 }
18190 // Replace chain users with the new chain.
18191 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18192 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18194 // Finally, do a normal sign-extend to the desired register.
18195 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18196 }
18198 // All sizes must be a power of two.
18199 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18200 "Non-power-of-two elements are not custom lowered!");
18202 // Attempt to load the original value using scalar loads.
18203 // Find the largest scalar type that divides the total loaded size.
18204 MVT SclrLoadTy = MVT::i8;
18205 for (MVT Tp : MVT::integer_valuetypes()) {
18206 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18207 SclrLoadTy = Tp;
18208 }
18209 }
18211 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18212 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18213 (64 <= MemSz))
18214 SclrLoadTy = MVT::f64;
18216 // Calculate the number of scalar loads that we need to perform
18217 // in order to load our vector from memory.
18218 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18220 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18221 "Can only lower sext loads with a single scalar load!");
18223 unsigned loadRegZize = RegSz;
18224 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18225 loadRegZize = 128;
18227 // Represent our vector as a sequence of elements which are the
18228 // largest scalar that we can load.
18229 EVT LoadUnitVecVT = EVT::getVectorVT(
18230 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18232 // Represent the data using the same element type that is stored in
18233 // memory. In practice, we ''widen'' MemVT.
18234 EVT WideVecVT =
18235 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18236 loadRegZize / MemVT.getScalarSizeInBits());
18238 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18239 "Invalid vector type");
18241 // We can't shuffle using an illegal type.
18242 assert(TLI.isTypeLegal(WideVecVT) &&
18243 "We only lower types that form legal widened vector types");
18245 SmallVector<SDValue, 8> Chains;
18246 SDValue Ptr = Ld->getBasePtr();
18247 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18248 TLI.getPointerTy(DAG.getDataLayout()));
18249 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18251 for (unsigned i = 0; i < NumLoads; ++i) {
18252 // Perform a single load.
18253 SDValue ScalarLoad =
18254 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18255 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18256 Chains.push_back(ScalarLoad.getValue(1));
18257 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18258 // another round of DAGCombining.
18259 if (i == 0)
18260 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18261 else
18262 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18263 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18265 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18266 }
18268 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18270 // Bitcast the loaded value to a vector of the original element type, in
18271 // the size of the target vector type.
18272 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18273 unsigned SizeRatio = RegSz / MemSz;
18275 if (Ext == ISD::SEXTLOAD) {
18276 // If we have SSE4.1, we can directly emit a VSEXT node.
18277 if (Subtarget.hasSSE41()) {
18278 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18279 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18280 return Sext;
18281 }
18283 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18284 // lane.
18285 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18286 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18288 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18289 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18290 return Shuff;
18291 }
18293 // Redistribute the loaded elements into the different locations.
18294 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18295 for (unsigned i = 0; i != NumElems; ++i)
18296 ShuffleVec[i * SizeRatio] = i;
18298 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18299 DAG.getUNDEF(WideVecVT), ShuffleVec);
18301 // Bitcast to the requested type.
18302 Shuff = DAG.getBitcast(RegVT, Shuff);
18303 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18304 return Shuff;
18305 }
18307 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18308 /// each of which has no other use apart from the AND / OR.
18309 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18310 Opc = Op.getOpcode();
18311 if (Opc != ISD::OR && Opc != ISD::AND)
18312 return false;
18313 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18314 Op.getOperand(0).hasOneUse() &&
18315 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18316 Op.getOperand(1).hasOneUse());
18317 }
18319 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
18320 /// SETCC node has a single use.
18321 static bool isXor1OfSetCC(SDValue Op) {
18322 if (Op.getOpcode() != ISD::XOR)
18323 return false;
18324 if (isOneConstant(Op.getOperand(1)))
18325 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18326 Op.getOperand(0).hasOneUse();
18327 return false;
18328 }
18330 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18331 bool addTest = true;
18332 SDValue Chain = Op.getOperand(0);
18333 SDValue Cond = Op.getOperand(1);
18334 SDValue Dest = Op.getOperand(2);
18335 SDLoc dl(Op);
18336 SDValue CC;
18337 bool Inverted = false;
18339 if (Cond.getOpcode() == ISD::SETCC) {
18340 // Check for setcc([su]{add,sub,mul}o == 0).
18341 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18342 isNullConstant(Cond.getOperand(1)) &&
18343 Cond.getOperand(0).getResNo() == 1 &&
18344 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18345 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18346 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18347 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18348 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18349 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18350 Inverted = true;
18351 Cond = Cond.getOperand(0);
18352 } else {
18353 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18354 Cond = NewCond;
18355 }
18356 }
18357 #if 0
18358 // FIXME: LowerXALUO doesn't handle these!!
18359 else if (Cond.getOpcode() == X86ISD::ADD ||
18360 Cond.getOpcode() == X86ISD::SUB ||
18361 Cond.getOpcode() == X86ISD::SMUL ||
18362 Cond.getOpcode() == X86ISD::UMUL)
18363 Cond = LowerXALUO(Cond, DAG);
18364 #endif
18366 // Look past (and (setcc_carry (cmp ...)), 1).
18367 if (Cond.getOpcode() == ISD::AND &&
18368 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18369 isOneConstant(Cond.getOperand(1)))
18370 Cond = Cond.getOperand(0);
18372 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18373 // setting operand in place of the X86ISD::SETCC.
18374 unsigned CondOpcode = Cond.getOpcode();
18375 if (CondOpcode == X86ISD::SETCC ||
18376 CondOpcode == X86ISD::SETCC_CARRY) {
18377 CC = Cond.getOperand(0);
18379 SDValue Cmp = Cond.getOperand(1);
18380 unsigned Opc = Cmp.getOpcode();
18381 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18382 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18383 Cond = Cmp;
18384 addTest = false;
18385 } else {
18386 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18387 default: break;
18388 case X86::COND_O:
18389 case X86::COND_B:
18390 // These can only come from an arithmetic instruction with overflow,
18391 // e.g. SADDO, UADDO.
18392 Cond = Cond.getOperand(1);
18393 addTest = false;
18394 break;
18395 }
18396 }
18397 }
18398 CondOpcode = Cond.getOpcode();
18399 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18400 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18401 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18402 Cond.getOperand(0).getValueType() != MVT::i8)) {
18403 SDValue LHS = Cond.getOperand(0);
18404 SDValue RHS = Cond.getOperand(1);
18405 unsigned X86Opcode;
18406 unsigned X86Cond;
18407 SDVTList VTs;
18408 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18409 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18410 // X86ISD::INC).
18411 switch (CondOpcode) {
18412 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18413 case ISD::SADDO:
18414 if (isOneConstant(RHS)) {
18415 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18416 break;
18417 }
18418 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18419 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18420 case ISD::SSUBO:
18421 if (isOneConstant(RHS)) {
18422 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18423 break;
18424 }
18425 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18426 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18427 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18428 default: llvm_unreachable("unexpected overflowing operator");
18429 }
18430 if (Inverted)
18431 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18432 if (CondOpcode == ISD::UMULO)
18433 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18436 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18438 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18440 if (CondOpcode == ISD::UMULO)
18441 Cond = X86Op.getValue(2);
18443 Cond = X86Op.getValue(1);
18445 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, dl, MVT::i8);
          SDNode *User = *Op.getNode()->use_begin();
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User->getOpcode() == ISD::BR) {
            SDValue FalseBB = User->getOperand(1);
            SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User);
            (void)NewBR;
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, dl, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
      // It should be transformed by the DAG combiner except when the condition
      // is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, dl, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
      // For FCMP_OEQ, we can emit
      // two branches instead of an explicit AND instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_OEQ.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;
          Dest = FalseBB;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
      // For FCMP_UNE, we can emit
      // two branches instead of an explicit OR instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_UNE.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;
          Dest = FalseBB;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
    }
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);
    // We know the result is compared against zero. Try to match it to BT.
    if (Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    Cond = EmitTest(Cond, X86Cond, dl, DAG);
  }
  Cond = ConvertCmpIfNecessary(Cond, DAG);
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
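// For example (illustrative numbers, not taken from this code): allocating 16K
// in one go must touch SP-4K, SP-8K, SP-12K and SP-16K in order, so each
// access hits the current guard page and commits the next one; storing
// directly to SP-16K would land beyond the guard page and fault.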
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool SplitStack = MF.shouldSplitStack();
  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack;
  SDLoc dl(Op);

  // Get the inputs.
  SDNode *Node = Op.getNode();
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);
  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  EVT VT = Node->getValueType(0);

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

  bool Is64Bit = Subtarget.is64Bit();
  MVT SPTy = getPointerTy(DAG.getDataLayout());

  SDValue Result;
  if (!Lower) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
                    " not tell us which reg is the stack pointer!");

    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
    Chain = SP.getValue(1);
    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
    unsigned StackAlign = TFI.getStackAlignment();
    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
    if (Align > StackAlign)
      Result = DAG.getNode(ISD::AND, dl, VT, Result,
                           DAG.getConstant(-(uint64_t)Align, dl, VT));
    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
  } else if (SplitStack) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use it along with nested
      // parameters.
      const Function *F = MF.getFunction();
      for (const auto &A : F->args()) {
        if (A.hasNestAttr())
          report_fatal_error("Cannot use segmented stacks with functions that "
                             "have nested arguments.");
      }
    }

    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                         DAG.getRegister(Vreg, SPTy));
  } else {
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned SPReg = RegInfo->getStackRegister();
    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
    Chain = SP.getValue(1);

    if (Align) {
      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, dl, VT));
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
    }

    Result = SP;
  }

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

  SDValue Ops[2] = {Result, Chain};
  return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);

  if (!Subtarget.is64Bit() ||
      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }
  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
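  // For reference, the equivalent C view of this layout (a sketch following
  // the AMD64 psABI; not code from this file):
  //
  //   typedef struct {
  //     unsigned int gp_offset;   // offset into reg_save_area for next GPR
  //     unsigned int fp_offset;   // offset into reg_save_area for next XMM
  //     void *overflow_arg_area;  // next argument passed on the stack
  //     void *reg_save_area;      // base of the register save area
  //   } va_list[1];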
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV));
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
  Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV, 4));
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  Store =
      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
  Store = DAG.getStore(
      Op.getOperand(0), DL, RSFIN, FIN,
      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
  MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert(Op.getNumOperands() == 4);

  MachineFunction &MF = DAG.getMachineFunction();
  if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
    // The Win64 ABI uses char* instead of a structure.
    return DAG.expandVAArg(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  SDLoc dl(Op);

  EVT ArgVT = Op.getNode()->getValueType(0);
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
  uint8_t ArgMode;

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety. This simple
  // selection mechanism works only for the basic types.
  if (ArgVT == MVT::f80) {
    llvm_unreachable("va_arg for f80 not yet implemented");
  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
  } else {
    llvm_unreachable("Unhandled argument type in LowerVAARG");
  }
  if (ArgMode == 2) {
    // Sanity Check: Make sure using fp_offset makes sense.
    assert(!Subtarget.useSoftFloat() &&
           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
           Subtarget.hasSSE1());
  }

  // Insert VAARG_64 node into the DAG
  // VAARG_64 returns two values: Variable Argument Address, Chain
  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
                       DAG.getConstant(ArgMode, dl, MVT::i8),
                       DAG.getConstant(Align, dl, MVT::i32)};
  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
                                          VTs, InstOps, MVT::i64,
                                          MachinePointerInfo(SV),
                                          /*Align=*/0,
                                          /*Volatile=*/false,
                                          /*ReadMem=*/true,
                                          /*WriteMem=*/true);
  Chain = VAARG.getValue(1);

  // Load the next argument and return it
  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
  // where a va_list is still an i8*.
  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
  if (Subtarget.isCallingConvWin64(
          DAG.getMachineFunction().getFunction()->getCallingConv()))
    // Probably a Win64 va_copy.
    return DAG.expandVACopy(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);

  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
                       false, false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
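// (The 24-byte memcpy above covers the entire { i32, i32, i8*, i8* } va_list:
// 4 + 4 + 8 + 8 = 24 bytes, copied with 8-byte alignment.)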
/// Handle vector element shifts where the shift amount is a constant.
/// Takes immediate version of shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                          SDValue SrcOp, uint64_t ShiftAmt,
                                          SelectionDAG &DAG) {
  MVT ElementType = VT.getVectorElementType();

  // Bitcast the source vector to the output type, this is mainly necessary for
  // vXi8/vXi64 shifts.
  if (VT != SrcOp.getSimpleValueType())
    SrcOp = DAG.getBitcast(VT, SrcOp);

  // Fold this packed shift into its first operand if ShiftAmt is 0.
  if (ShiftAmt == 0)
    return SrcOp;

  // Check for ShiftAmt >= element width
  if (ShiftAmt >= ElementType.getSizeInBits()) {
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = ElementType.getSizeInBits() - 1;
    else
      return DAG.getConstant(0, dl, VT);
  }

  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
         && "Unknown target vector shift-by-constant node");
  // Fold this packed vector shift into a build vector if SrcOp is a
  // vector of Constants or UNDEFs.
  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
    SmallVector<SDValue, 8> Elts;
    unsigned NumElts = SrcOp->getNumOperands();
    ConstantSDNode *ND;

    switch(Opc) {
    default: llvm_unreachable("Unknown opcode!");
    case X86ISD::VSHLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRAI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
      }
      break;
    }

    return DAG.getBuildVector(VT, dl, Elts);
  }

  return DAG.getNode(Opc, dl, VT, SrcOp,
                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                   SDValue SrcOp, SDValue ShAmt,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT SVT = ShAmt.getSimpleValueType();
  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

  // Catch shift-by-constant.
  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
                                      CShAmt->getZExtValue(), DAG);

  // Change opcode to non-immediate version
  switch (Opc) {
  default: llvm_unreachable("Unknown target vector shift node");
  case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
  case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
  case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
  }
  // Need to build a vector containing shift amount.
  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
  // +=================+============+=======================================+
  // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
  // +=================+============+=======================================+
  // | i64             | Yes, No    | Use ShAmt as lowest elt               |
  // | i32             | Yes        | zero-extend in-reg                    |
  // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
  // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
  // +=================+============+=======================================+
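  // E.g. for a v4i32 shift by an i32 amount without SSE4.1, the last row
  // applies: ShAmt becomes (v4i32 build_vector ShAmt, 0, undef, undef), and
  // the packed-shift instruction reads only the low 64 bits of that vector.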
  if (SVT == MVT::i64)
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
           ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
    ShAmt = ShAmt.getOperand(0);
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else if (Subtarget.hasSSE41() &&
             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else {
    SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
                                     DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
  }

  // The return type has to be a 128-bit type with the same element
  // type as the input type.
  MVT EltVT = VT.getVectorElementType();
  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());

  ShAmt = DAG.getBitcast(ShVT, ShAmt);
  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
/// \brief Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl) {

  if (isAllOnesConstant(Mask))
    return DAG.getTargetConstant(1, dl, MaskVT);
  if (X86::isZeroNode(Mask))
    return DAG.getTargetConstant(0, dl, MaskVT);

  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
    // Mask should be extended
    Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
  }

  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
    if (MaskVT == MVT::v64i1) {
      assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
      // In case 32bit mode, bitcast i64 is illegal, extend/split it.
      SDValue Lo, Hi;
      Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                       DAG.getConstant(0, dl, MVT::i32));
      Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                       DAG.getConstant(1, dl, MVT::i32));

      Lo = DAG.getBitcast(MVT::v32i1, Lo);
      Hi = DAG.getBitcast(MVT::v32i1, Hi);

      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
    } else {
      // MaskVT requires < 64bit. Truncate the mask (should succeed in any
      // case), and bitcast to the required vector type (e.g. v16i1).
      MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
      return DAG.getBitcast(MaskVT,
                            DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
    }
  }

  MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                   Mask.getSimpleValueType().getSizeInBits());
  // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
  // are extracted by EXTRACT_SUBVECTOR.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                     DAG.getBitcast(BitcastVT, Mask),
                     DAG.getIntPtrConstant(0, dl));
}
/// \brief Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  unsigned OpcodeSelect = ISD::VSELECT;
  SDLoc dl(Op);

  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  switch (Op.getOpcode()) {
  default: break;
  case X86ISD::PCMPEQM:
  case X86ISD::PCMPGTM:
  case X86ISD::CMPM:
  case X86ISD::CMPMU:
    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
  case X86ISD::VFPCLASS:
  case X86ISD::VFPCLASSS:
    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
  case X86ISD::VTRUNC:
  case X86ISD::VTRUNCS:
  case X86ISD::VTRUNCUS:
  case X86ISD::CVTPS2PH:
    // We can't use ISD::VSELECT here because it is not always "Legal"
    // for the destination type. For example vpmovqb requires only AVX512,
    // but a vselect that operates on byte elements requires BWI.
    OpcodeSelect = X86ISD::SELECT;
    break;
  }
  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask is coming as MVT::i8 and it should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {

  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
    if (MaskConst->getZExtValue() & 0x1)
      return Op;

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
  if (Op.getOpcode() == X86ISD::FSETCCM ||
      Op.getOpcode() == X86ISD::FSETCCM_RND)
    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
  if (Op.getOpcode() == X86ISD::VFPCLASSS)
    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);

  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
  if (!Fn->hasPersonalityFn())
    report_fatal_error(
        "querying registration node size for function without personality");
  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
  // WinEHStatePass for the full struct definition.
  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
  case EHPersonality::MSVC_X86SEH: return 24;
  case EHPersonality::MSVC_CXX: return 16;
  default: break;
  }
  report_fatal_error(
      "can only recover FP for 32-bit MSVC EH personality functions");
}
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
///   RegNodeBase = EntryEBP - RegNodeSize
///   ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
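/// Worked example with made-up numbers: for a 32-bit SEH personality
/// (RegNodeSize = 24), EntryEBP = 0x1000 gives RegNodeBase = 0x1000 - 24 =
/// 0xFE8; if WinEHStatePass recorded ParentFrameOffset = -0x10, then
/// ParentFP = 0xFE8 - (-0x10) = 0xFF8.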
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
                                   SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  // It's possible that the parent function no longer has a personality function
  // if the exceptional code was optimized away, in which case we just return
  // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;

  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
  // registration, or the .set_setframe offset.
  MCSymbol *OffsetSym =
      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
  SDValue ParentFrameOffset =
      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
  // prologue to RBP in the parent function.
  const X86Subtarget &Subtarget =
      static_cast<const X86Subtarget &>(DAG.getSubtarget());
  if (Subtarget.is64Bit())
    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
  // RegNodeBase = EntryEBP - RegNodeSize
  // ParentFP = RegNodeBase - ParentFrameOffset
  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  // Helper to detect if the operand is CUR_DIRECTION rounding mode.
  auto isRoundModeCurDirection = [](SDValue Rnd) {
    if (!isa<ConstantSDNode>(Rnd))
      return false;

    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
  };

  SDLoc dl(Op);
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  MVT VT = Op.getSimpleValueType();
  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
  if (IntrData) {
    switch(IntrData->Type) {
    case INTR_TYPE_1OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
    case INTR_TYPE_2OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    case INTR_TYPE_3OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3));
    case INTR_TYPE_4OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
    case INTR_TYPE_1OP_MASK_RM: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue RoundingMode;
      // We always add rounding mode to the Node.
      // If the rounding mode is not specified, we add the
      // "current direction" mode.
      if (Op.getNumOperands() == 4)
        RoundingMode =
          DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      else
        RoundingMode = Op.getOperand(4);
      assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              RoundingMode),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_1OP_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add rounding mode to the Node when
      //   - RM Opcode is specified and
      //   - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue passThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, VT, Src1, Src2, Rnd),
                                      Mask, passThru, Subtarget, DAG);
      }
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, passThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src0 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // There are 2 kinds of intrinsics in this group:
      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
      // (2) With rounding mode and sae - 7 operands.
      if (Op.getNumOperands() == 6) {
        SDValue Sae = Op.getOperand(5);
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                                Sae),
                                    Mask, Src0, Subtarget, DAG);
      }
      assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
      SDValue RoundingMode = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                              RoundingMode, Sae),
                                  Mask, Src0, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK:
    case INTR_TYPE_2OP_IMM8_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
        Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have non-default rounding mode,
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      // TODO: Intrinsics should have fast-math-flags to propagate.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // modes.
      // First, we check if the intrinsic has a rounding mode (6 operands),
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 6)
        Rnd = Op.getOperand(5);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);

      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                              Src2, Src3, Sae),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Imm = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // modes.
      // First, we check if the intrinsic has a rounding mode (7 operands),
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Imm, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_IMM8_MASK:
    case INTR_TYPE_3OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);

      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have non-default rounding mode,
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(6);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_2OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      // Swap Src1 and Src2 in the node creation
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_3OP_MASKZ:
    case VPERM_3OP_MASK: {
      MVT VT = Op.getSimpleValueType();
      // Src2 is the PassThru
      SDValue Src1 = Op.getOperand(1);
      // PassThru needs to be the same type as the destination in order
      // to pattern match correctly.
      SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      SDValue PassThru = SDValue();

      // set PassThru element
      if (IntrData->Type == VPERM_3OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else
        PassThru = Src2;

      // Swap Src1 and Src2 in the node creation
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src2, Src1, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_MASKZ:
    case FMA_OP_MASK3:
    case FMA_OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // set PassThru element
      if (IntrData->Type == FMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have non-default rounding mode,
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_SCALAR_MASK:
    case FMA_OP_SCALAR_MASK3:
    case FMA_OP_SCALAR_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // set PassThru element
      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      SDValue Rnd = Op.getOperand(5);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                              Op.getValueType(), Src1, Src2,
                                              Src3, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case TERLOG_OP_MASK:
    case TERLOG_OP_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
      SDValue Mask = Op.getOperand(5);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = Src1;
      // Set PassThru element.
      if (IntrData->Type == TERLOG_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3, Src4),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CVTPD2PS:
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
                         DAG.getIntPtrConstant(0, dl));
    case CVTPD2PS_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add rounding mode to the Node when
      //   - RM Opcode is specified and
      //   - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              DAG.getIntPtrConstant(0, dl)),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FPCLASS: {
      // FPclass intrinsics with mask
      SDValue Src1 = Op.getOperand(1);
      MVT VT = Src1.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
      SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MaskVT),
                                                 Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), FPclassMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case FPCLASSS: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
          DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case CMP_MASK:
    case CMP_MASK_CC: {
      // Comparison intrinsics with masks.
      // Example of transformation:
      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
      // (i8 (bitcast
      //   (v8i1 (insert_subvector undef,
      //           (v2i1 (and (PCMPEQM %a, %b),
      //                      (extract_subvector
      //                         (v8i1 (bitcast %mask)), 0))), 0))))
      MVT VT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue Cmp;
      if (IntrData->Type == CMP_MASK_CC) {
        SDValue CC = Op.getOperand(3);
        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have non-default rounding mode,
        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
        if (IntrData->Opc1 != 0) {
          SDValue Rnd = Op.getOperand(5);
          if (!isRoundModeCurDirection(Rnd))
            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
                              Op.getOperand(2), CC, Rnd);
        }
        // default rounding mode
        if (!Cmp.getNode())
          Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC);
      } else {
        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                          Op.getOperand(2));
      }
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MaskVT),
                                             Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CmpMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case CMP_MASK_SCALAR_CC: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
      SDValue Mask = Op.getOperand(4);

      SDValue Cmp;
      if (IntrData->Opc1 != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
      }
      // default rounding mode
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MVT::i1),
                                             Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case COMI: { // Comparison intrinsics
      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: { // (ZF = 0 and PF = 0)
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }
      case ISD::SETNE: { // (ZF = 1 or PF = 1)
        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
        break;
      }
      case ISD::SETGT: // (CF = 0 and ZF = 0)
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
        break;
      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
        break;
      }
      case ISD::SETGE: // CF = 0
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
        break;
      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
        break;
      default:
        llvm_unreachable("Unexpected illegal condition!");
      }
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
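    // (For reference: [u]comis[sd] encodes its result as ZF/PF/CF = 111 for
    // unordered, 100 for equal, 001 for less-than and 000 for greater-than,
    // which is why the SETEQ/SETNE mappings above must also consult PF.)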
    case COMI_RM: { // Comparison intrinsics with Sae
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      SDValue Sae = Op.getOperand(4);

      SDValue FCmp;
      if (isRoundModeCurDirection(Sae))
        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8));
      else
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8), Sae);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
                         DAG.getIntPtrConstant(0, dl));
    }
    case VSHIFT:
      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                 Op.getOperand(1), Op.getOperand(2), Subtarget,
                                 DAG);
    case COMPRESS_EXPAND_IN_REG: {
      SDValue Mask = Op.getOperand(3);
      SDValue DataToCompress = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      if (isAllOnesConstant(Mask)) // return data as is
        return Op.getOperand(1);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              DataToCompress),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case BROADCASTM: {
      SDValue Mask = Op.getOperand(1);
      MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                    Mask.getSimpleValueType().getSizeInBits());
      Mask = DAG.getBitcast(MaskVT, Mask);
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
    }
    case KUNPCK: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      // Arguments should be swapped.
      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
                                Src2, Src1);
      return DAG.getBitcast(VT, Res);
    }
    case MASK_BINOP: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
      return DAG.getBitcast(VT, Res);
    }
    case FIXUPIMM:
    case FIXUPIMMS_MASKZ:
    case FIXUPIMMS:
    case FIXUPIMM_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Imm = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ?
                         Src1 : getZeroVector(VT, Subtarget, DAG, dl);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // modes.
      // First, we check if the intrinsic has a rounding mode (7 operands),
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
      else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
    }
    case CONVERT_TO_MASK: {
      MVT SrcVT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
      MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
                                    Op.getOperand(1));
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CvtMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case BRCST_SUBVEC_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue Passthru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      EVT resVT = Passthru.getValueType();
      SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
                                   DAG.getUNDEF(resVT), Src,
                                   DAG.getIntPtrConstant(0, dl));
      SDValue immVal;
      if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
        immVal = DAG.getConstant(0x44, dl, MVT::i8);
      else
        immVal = DAG.getConstant(0, dl, MVT::i8);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              subVec, subVec, immVal),
                                  Mask, Passthru, Subtarget, DAG);
    }
    case BRCST32x2_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);

      assert((VT.getScalarType() == MVT::i32 ||
              VT.getScalarType() == MVT::f32) && "Unexpected type!");
      // bitcast Src to packed 64
      MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
      MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
      Src = DAG.getBitcast(BitcastVT, Src);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    default:
      break;
    }
  }

  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
    // Operands intentionally swapped. Mask is last operand to intrinsic,
    // but second operand for node/instruction.
    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(1));
  // ptest and testp intrinsics. The intrinsics these come from are designed to
  // return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  case Intrinsic::x86_avx512_kortestz_w:
  case Intrinsic::x86_avx512_kortestc_w: {
    X86::CondCode X86CC =
        (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  case Intrinsic::x86_avx512_knot_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kandn_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    // Invert LHS for the not.
    LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
                      DAG.getConstant(1, dl, MVT::v16i1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kxnor_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    // Invert result for the not.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
                      DAG.getConstant(1, dl, MVT::v16i1));
    return DAG.getBitcast(MVT::i16, Res);
  }
  case Intrinsic::x86_sse42_pcmpistria128:
  case Intrinsic::x86_sse42_pcmpestria128:
  case Intrinsic::x86_sse42_pcmpistric128:
  case Intrinsic::x86_sse42_pcmpestric128:
  case Intrinsic::x86_sse42_pcmpistrio128:
  case Intrinsic::x86_sse42_pcmpestrio128:
  case Intrinsic::x86_sse42_pcmpistris128:
  case Intrinsic::x86_sse42_pcmpestris128:
  case Intrinsic::x86_sse42_pcmpistriz128:
  case Intrinsic::x86_sse42_pcmpestriz128: {
    unsigned Opcode;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
    case Intrinsic::x86_sse42_pcmpistria128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpestria128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpistric128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpestric128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpistrio128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpestrio128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpistris128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpestris128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpistriz128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_sse42_pcmpestriz128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_E;
      break;
    }
    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
    SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  case Intrinsic::x86_sse42_pcmpistri128:
  case Intrinsic::x86_sse42_pcmpestri128: {
    unsigned Opcode;
    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
      Opcode = X86ISD::PCMPISTRI;
    else
      Opcode = X86ISD::PCMPESTRI;

    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    return DAG.getNode(Opcode, dl, VTs, NewOps);
  }
  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    auto &Context = MF.getMMI().getContext();
    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
                                            Twine(MF.getFunctionNumber()));
    return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
  }
  case Intrinsic::x86_seh_lsda: {
    // Compute the symbol for the LSDA. We know it'll get emitted later.
    MachineFunction &MF = DAG.getMachineFunction();
    SDValue Op1 = Op.getOperand(1);
    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
        GlobalValue::dropLLVMManglingEscape(Fn->getName()));

    // Generate a simple absolute symbol reference. This intrinsic is only
    // supported on 32-bit Windows, which isn't PIC.
    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
  }
  case Intrinsic::x86_seh_recoverfp: {
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.x86.seh.recoverfp must take a function as the first argument");
    return recoverFramePointer(DAG, Fn, IncomingFPOp);
  }
  case Intrinsic::localaddress: {
    // Returns one of the stack, base, or frame pointer registers, depending on
    // which is used to reference local variables.
    MachineFunction &MF = DAG.getMachineFunction();
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned Reg;
    if (RegInfo->hasBasePointer(MF))
      Reg = RegInfo->getBaseRegister();
    else // This function handles the SP or FP case.
      Reg = RegInfo->getPtrSizedFrameRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
  }
  }
}

static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  EVT MaskVT = Mask.getValueType();
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
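  // Each gathered element is loaded from Base + Index[i] * Scale + Disp; the
  // zero Disp and Segment values above simply complete the standard x86
  // memory-operand tuple expected by the gather machine instructions.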
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}

static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}

static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  return SDValue(Res, 1);
}

static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Mask, SDValue Base, SDValue Index,
                               SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT =
      MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
  return SDValue(Res, 0);
}

/// Handles the lowering of builtin intrinsics that return the value
/// of the extended control register.
static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
                                       SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the XCR register to
  // return.
  SDValue Chain =
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
  SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
  Chain = SDValue(N1, 0);

  // Reads the content of XCR and returns it in registers EDX:EAX.
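  // (For example, index 0 selects XCR0, whose low bits indicate which
  // register states (x87, SSE, AVX) the OS has enabled for XSAVE.)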
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

/// Handles the lowering of builtin intrinsics that read performance monitor
/// counters (x86_rdpmc).
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the performance counter
  // to read.
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                   N->getOperand(2));
  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);

  // Reads the content of a 64-bit performance counter and returns it in the
  // registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // The EAX register is loaded with the low-order 32 bits. The EDX register
    // is loaded with the supported high-order bits of the counter.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    SmallVectorImpl<SDValue> &Results) {
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
  SDValue LO, HI;

  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
  // and the EAX register is loaded with the low-order 32 bits.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  SDValue Chain = HI.getValue(1);

  if (Opcode == X86ISD::RDTSCP_DAG) {
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
    // the ECX register. Add 'ecx' explicitly to the chain.
    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
                                     HI.getValue(2));
    // Explicitly store the content of ECX at the location passed in input
    // to the 'rdtscp' intrinsic.
    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
                         MachinePointerInfo());
  }

  if (Subtarget.is64Bit()) {
    // The EDX register is loaded with the high-order 32 bits of the MSR, and
    // the EAX register is loaded with the low-order 32 bits.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SmallVector<SDValue, 2> Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}

static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
  return SignedSat ?
    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}

/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat ?
    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}

static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here, we will expand these intrinsics out later
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
      return SDValue();
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64: {
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      SDValue LwpIns =
          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                         LwpIns.getValue(1));
    }
    }
    return SDValue();
  }

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, cast to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i32),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
                                  Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER_AVX2: {
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    // gather(v1, mask, index, base, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    // scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base = Op.getOperand(2);
    SDValue Mask = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Get Extended Control Register.
  case XGETBV: {
    SmallVector<SDValue, 2> Results;
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  case ADX: {
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo());
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
    SDValue Results[] = { SetCC, Store };
    return DAG.getMergeValues(Results, dl);
  }
  case COMPRESS_TO_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToCompress = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = DataToCompress.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // return just a store
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
                          MemIntr->getMemOperand());

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
                              MemIntr->getMemOperand(),
                              false /* truncating */, true /* compressing */);
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // return just a truncate store
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
                                MemIntr->getMemOperand(), true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  case EXPAND_FROM_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue PassThru = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = Op.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
    if (X86::isZeroNode(Mask))
      return DAG.getUNDEF(VT);

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
                             true /* expanding */);
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}

SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op); // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const MachineFunction &MF = DAG.getMachineFunction();

  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("esp", X86::ESP)
                     .Case("rsp", X86::RSP)
                     .Case("ebp", X86::EBP)
                     .Case("rbp", X86::RBP)
                     .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      unsigned FrameReg =
          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}

SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}

unsigned X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}

unsigned X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}

bool X86TargetLowering::needsFixedCatchObjects() const {
  return Subtarget.isTargetWin64();
}

SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // If the subtarget is not 64bit, we may need the global base reg
  // after isel expand pseudo, i.e., after CGBR pass ran.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we will end up in a situation where we will
  // reference a virtual register that is not defined!
  if (!Subtarget.is64Bit()) {
    const X86InstrInfo *TII = Subtarget.getInstrInfo();
    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
  }
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                     Op.getOperand(0));
}

static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  return Op.getOperand(0);
}

SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
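
    // The stores below assemble this 24-byte sequence in the trampoline,
    // built from the opcode constants above (little-endian byte order;
    // shown for illustration):
    //   +0:  49 BB <imm64 FPtr>   movabsq $FPtr, %r11
    //   +10: 49 BA <imm64 Nest>   movabsq $Nest, %r10
    //   +20: 49 FF E3             jmpq    *%r11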

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] =
        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
                     /* Alignment = */ 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td.
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
                     /* Alignment = */ 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
            auto &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
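
    // The stores below assemble this 10-byte trampoline (shown for
    // illustration, with ECX as the nest register):
    //   +0: B9 <imm32 Nest>   movl $Nest, %ecx
    //   +5: E9 <rel32 Disp>   jmp  FPtr      (Disp = FPtr - (Trmp + 10))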

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
      DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                   Trmp, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] =
      DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
                   /* Alignment = */ 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 5),
                                /* Alignment = */ 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] =
      DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
                   /* Alignment = */ 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

   FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

   To perform the conversion, we do:
     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */
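
  // A worked check of the mapping: the expression computes
  // ((bit10 << 1) | bit11) + 1, masked to two bits, so
  //   RC=00 -> 1 (to nearest),  RC=01 -> 3 (to -inf),
  //   RC=10 -> 2 (to +inf),     RC=11 -> 0 (to 0).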

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD =
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

  // Transform as necessary
  SDValue CWD1 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x800, DL, MVT::i16)),
                  DAG.getConstant(11, DL, MVT::i8));
  SDValue CWD2 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x400, DL, MVT::i16)),
                  DAG.getConstant(9, DL, MVT::i8));

  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i16,
                  DAG.getNode(ISD::ADD, DL, MVT::i16,
                              DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                              DAG.getConstant(1, DL, MVT::i16)),
                  DAG.getConstant(3, DL, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}

// Split a unary integer op into 2 half-sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Extract the Lo/Hi vectors.
  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
  SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
  SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}

// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is512BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 512-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using a dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
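//
// For example, an i8 element 0x12 zero-extends to 0x00000012; lzcnt32 gives
// 27, and subtracting the delta 32 - 8 = 24 yields ctlz8(0x12) = 3.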
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  // Split the vector; its Lo and Hi parts will be handled in the next
  // iteration.
  if (16 < NumElems)
    return LowerVectorIntUnary(Op, DAG);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use the natively supported vector instruction vplzcntd.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}

// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
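  // Illustration: for the byte 0x1C the hi nibble is 0x1 (LUT value 3) and
  // is nonzero, so the result is 3 = ctlz8(0x1C). For 0x07 the hi nibble is
  // zero, so both lookups are summed: LUT[0] + LUT[7] = 4 + 1 = 5 = ctlz8(0x07).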

  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Begin by bitcasting the input to a byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

  // Merge the result back from vXi8 to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);

    CurrVT = NextVT;
  }

  return Res;
}

static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI())
    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}

static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), returns NumBits.
    SDValue Ops[] = {
      Op,
      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Op.getValue(1)
    };
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  }

  // Finally xor with NumBits-1.
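  // BSR returns the index of the highest set bit, so ctlz = (NumBits - 1) -
  // index, which equals index ^ (NumBits - 1) since 0 <= index < NumBits.
  // For a zero source the CMOV above substituted 2*NumBits - 1, and
  // (2*NumBits - 1) ^ (NumBits - 1) = NumBits, the required ctlz(0).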
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
                   DAG.getConstant(NumBits - 1, dl, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getScalarSizeInBits();
  SDLoc dl(Op);
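
  // Illustration of the vector identities below for x = 0b01100 (12):
  // lsb = x & -x = 0b00100; cttz(x) = ctpop(lsb - 1) = ctpop(0b011) = 2, and
  // cttz_undef(x) = (NumBits - 1) - ctlz(lsb) = (8 - 1) - 5 = 2 for i8.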

  if (VT.isVector()) {
    SDValue N0 = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, VT);

    // lsb(x) = (x & -x)
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));

    // cttz_undef(x) = (width - 1) - ctlz(lsb)
    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
    }

    // cttz(x) = ctpop(lsb - 1)
    SDValue One = DAG.getConstant(1, dl, VT);
    return DAG.getNode(ISD::CTPOP, dl, VT,
                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
  }

  assert(Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}

/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors.
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors.
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}

/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is512BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors.
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors.
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}

static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
                       Op.getOperand(0), Op.getOperand(1));
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}

static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntUnary(Op, DAG);
}

static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}

static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
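  // (Truncation keeps only the low 8 bits of each 16-bit product, which are
  // identical for signed and unsigned interpretations of the inputs, so
  // sign-extending is safe for both.)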
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    if (Subtarget.hasInt256()) {
      // For 512-bit vectors, split into 256-bit vectors to allow the
      // sign-extension to occur.
      if (VT == MVT::v64i8)
        return Lower512IntArith(Op, DAG);

      // For 256-bit vectors, split into 128-bit vectors to allow the
      // sign-extension to occur. We don't need this on AVX512BW as we can
      // safely sign-extend to v32i16.
      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
        return Lower256IntArith(Op, DAG);

      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      return DAG.getNode(
          ISD::TRUNCATE, dl, VT,
          DAG.getNode(ISD::MUL, dl, ExVT,
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
    }

    assert(VT == MVT::v16i8 &&
           "Pre-AVX2 support only supports v16i8 multiplication");
    MVT ExVT = MVT::v8i16;

    // Extract the lo parts and sign extend to i16.
    SDValue ALo, BLo;
    if (Subtarget.hasSSE41()) {
      ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
    } else {
      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                              -1, 4, -1, 5, -1, 6, -1, 7};
      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      ALo = DAG.getBitcast(ExVT, ALo);
      BLo = DAG.getBitcast(ExVT, BLo);
      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
    }

    // Extract the hi parts and sign extend to i16.
    SDValue AHi, BHi;
    if (Subtarget.hasSSE41()) {
      const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                              -1, -1, -1, -1, -1, -1, -1, -1};
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
    } else {
      const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                              -1, 12, -1, 13, -1, 14, -1, 15};
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      AHi = DAG.getBitcast(ExVT, AHi);
      BHi = DAG.getBitcast(ExVT, BHi);
      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
    }

    // Multiply, mask the lower 8 bits of the lo/hi results and pack.
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  }

  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmuldq is available!");

    // Extract the odd parts.
    static const int UnpackMask[] = { 1, -1, 3, -1 };
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

    // Multiply the even parts.
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
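    // PMULUDQ multiplies 32-bit lanes 0 and 2 of its operands, so Evens now
    // holds a0*b0 and a2*b2 and Odds holds a1*b1 and a3*b3, each as a 64-bit
    // lane; after the bitcasts the low 32 bits of each product sit in 32-bit
    // lanes 0 and 2.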
    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }

  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");

  // 32-bit vector types used for MULDQ/MULUDQ.
  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);

  // MULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32-bits. We can lower with this if the sign bits stretch that far.
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
      DAG.ComputeNumSignBits(B) > 32) {
    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
                       DAG.getBitcast(MulVT, B));
  }

  // Ahi = psrlqi(a, 32);
  // Bhi = psrlqi(b, 32);
  //
  // AloBlo = pmuludq(a, b);
  // AloBhi = pmuludq(a, Bhi);
  // AhiBlo = pmuludq(Ahi, b);
  //
  // Hi = psllqi(AloBhi + AhiBlo, 32);
  // return AloBlo + Hi;
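  //
  // This is just 64-bit schoolbook multiplication: with a = Alo + 2^32*Ahi
  // and b = Blo + 2^32*Bhi, a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo)
  // + 2^64*Ahi*Bhi, and the last term vanishes modulo 2^64.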
21422 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21423 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21424 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21426 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21427 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21428 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21430 // Bit cast to 32-bit vectors for MULUDQ.
21431 SDValue Alo = DAG.getBitcast(MulVT, A);
21432 SDValue Blo = DAG.getBitcast(MulVT, B);
21434 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21436 // Only multiply lo/hi halves that aren't known to be zero.
21437 SDValue AloBlo = Zero;
21438 if (!ALoIsZero && !BLoIsZero)
21439 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21441 SDValue AloBhi = Zero;
21442 if (!ALoIsZero && !BHiIsZero) {
21443 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21444 Bhi = DAG.getBitcast(MulVT, Bhi);
21445 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21446 }
21448 SDValue AhiBlo = Zero;
21449 if (!AHiIsZero && !BLoIsZero) {
21450 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21451 Ahi = DAG.getBitcast(MulVT, Ahi);
21452 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
21453 }
21455 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21456 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21458 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21459 }
21461 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21462 SelectionDAG &DAG) {
21463 SDLoc dl(Op);
21464 MVT VT = Op.getSimpleValueType();
21466 // Decompose 256-bit ops into smaller 128-bit ops.
21467 if (VT.is256BitVector() && !Subtarget.hasInt256())
21468 return Lower256IntArith(Op, DAG);
21470 // Only i8 vectors should need custom lowering after this.
21471 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21472 "Unsupported vector type");
21474 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21475 // logical shift down the upper half and pack back to i8.
21476 SDValue A = Op.getOperand(0);
21477 SDValue B = Op.getOperand(1);
21479 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21480 // and then ashr/lshr the upper bits down to the lower bits before multiply.
21481 unsigned Opcode = Op.getOpcode();
21482 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21483 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21485 // AVX2 implementations - extend xmm subvectors to ymm.
21486 if (Subtarget.hasInt256()) {
21487 SDValue Lo = DAG.getIntPtrConstant(0, dl);
21488 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21490 if (VT == MVT::v32i8) {
21491 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21492 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21493 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21494 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21495 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21496 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21497 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21498 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21499 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21500 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21501 DAG.getConstant(8, dl, MVT::v16i16));
21502 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21503 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21504 DAG.getConstant(8, dl, MVT::v16i16));
21505 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21506 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21507 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
21508 16, 17, 18, 19, 20, 21, 22, 23};
21509 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21510 24, 25, 26, 27, 28, 29, 30, 31};
21511 return DAG.getNode(X86ISD::PACKUS, dl, VT,
21512 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21513 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21514 }
21516 SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21517 SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21518 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21519 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21520 DAG.getConstant(8, dl, MVT::v16i16));
21521 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21522 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21523 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21524 }
21526 assert(VT == MVT::v16i8 &&
21527 "Pre-AVX2 support only supports v16i8 multiplication");
21528 MVT ExVT = MVT::v8i16;
21530 // Extract the lo parts and zero/sign extend to i16.
21531 SDValue ALo, BLo;
21532 if (Subtarget.hasSSE41()) {
21533 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21534 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
21535 } else {
21536 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21537 -1, 4, -1, 5, -1, 6, -1, 7};
21538 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21539 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21540 ALo = DAG.getBitcast(ExVT, ALo);
21541 BLo = DAG.getBitcast(ExVT, BLo);
21542 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21543 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21544 }
21546 // Extract the hi parts and zero/sign extend to i16.
21547 SDValue AHi, BHi;
21548 if (Subtarget.hasSSE41()) {
21549 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21550 -1, -1, -1, -1, -1, -1, -1, -1};
21551 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21552 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21553 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21554 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
21555 } else {
21556 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21557 -1, 12, -1, 13, -1, 14, -1, 15};
21558 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21559 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21560 AHi = DAG.getBitcast(ExVT, AHi);
21561 BHi = DAG.getBitcast(ExVT, BHi);
21562 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21563 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21564 }
21566 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
21567 // pack back to v16i8.
21568 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21569 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21570 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21571 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21572 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21573 }
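// Example for one MULHU byte lane: 200 * 200 widens to 40000 (0x9C40); the
// logical shift right by 8 leaves 0x9C = 156, the correct unsigned high byte.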
21575 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21576 assert(Subtarget.isTargetWin64() && "Unexpected target");
21577 EVT VT = Op.getValueType();
21578 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21579 "Unexpected return type for lowering");
21583 switch (Op->getOpcode()) {
21584 default: llvm_unreachable("Unexpected request for libcall!");
21585 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
21586 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
21587 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
21588 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
21589 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
21590 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21594 SDValue InChain = DAG.getEntryNode();
21596 TargetLowering::ArgListTy Args;
21597 TargetLowering::ArgListEntry Entry;
21598 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21599 EVT ArgVT = Op->getOperand(i).getValueType();
21600 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21601 "Unexpected argument type for lowering");
21602 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21603 Entry.Node = StackPtr;
21604 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21605 MachinePointerInfo(), /* Alignment = */ 16);
21606 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21607 Entry.Ty = PointerType::get(ArgTy,0);
21608 Entry.IsSExt = false;
21609 Entry.IsZExt = false;
21610 Args.push_back(Entry);
21611 }
21613 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21614 getPointerTy(DAG.getDataLayout()));
21616 TargetLowering::CallLoweringInfo CLI(DAG);
21617 CLI.setDebugLoc(dl)
21618 .setChain(InChain)
21619 .setLibCallee(
21620 getLibcallCallingConv(LC),
21621 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21622 std::move(Args))
21623 .setInRegister()
21624 .setSExtResult(isSigned)
21625 .setZExtResult(!isSigned);
21627 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21628 return DAG.getBitcast(VT, CallInfo.first);
21629 }
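// The libcall result type is declared as MVT::v2i64 above because these
// Win64 128-bit division/remainder helpers hand the result back in a vector
// register (hence also setInRegister()); the bitcast reinterprets it as the
// requested 128-bit integer type.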
21631 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21632 SelectionDAG &DAG) {
21633 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21634 MVT VT = Op0.getSimpleValueType();
21635 SDLoc dl(Op);
21637 // Decompose 256-bit ops into smaller 128-bit ops.
21638 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21639 unsigned Opcode = Op.getOpcode();
21640 unsigned NumElems = VT.getVectorNumElements();
21641 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21642 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21643 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21644 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21645 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21646 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21647 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21648 SDValue Ops[] = {
21649 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21650 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21651 };
21652 return DAG.getMergeValues(Ops, dl);
21653 }
21655 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21656 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21658 // PMULxD operations multiply each even value (starting at 0) of LHS with
21659 // the related value of RHS and produce a widened result.
21660 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21661 // => <2 x i64> <ae|cg>
21663 // In other words, to have all the results, we need to perform two PMULxD:
21664 // 1. one with the even values.
21665 // 2. one with the odd values.
21666 // To achieve #2, we need to place the odd values at an even position.
21668 // Place the odd value at an even position (basically, shift all values 1
21669 // step to the left):
21670 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21671 // <a|b|c|d> => <b|undef|d|undef>
21672 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21673 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21674 // <e|f|g|h> => <f|undef|h|undef>
21675 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21676 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21678 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21679 // ints.
21680 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21681 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21682 unsigned Opcode =
21683 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21684 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21685 // => <2 x i64> <ae|cg>
21686 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21687 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21688 // => <2 x i64> <bf|dh>
21689 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21691 // Shuffle it back into the right order.
21692 SDValue Highs, Lows;
21693 if (VT == MVT::v8i32) {
21694 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21695 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21696 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21697 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21698 } else {
21699 const int HighMask[] = {1, 5, 3, 7};
21700 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21701 const int LowMask[] = {0, 4, 2, 6};
21702 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21703 }
21705 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
21706 // unsigned multiply.
21707 if (IsSigned && !Subtarget.hasSSE41()) {
21708 SDValue ShAmt = DAG.getConstant(
21709 31, dl,
21710 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21711 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21712 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21713 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21714 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21716 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21717 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21718 }
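// This computes mulhs(a,b) = mulhu(a,b) - (a < 0 ? b : 0) - (b < 0 ? a : 0):
// reading a negative a as unsigned adds 2^32 to it, inflating the unsigned
// high half of the product by exactly b (and symmetrically for b), and
// (sra(a, 31) & b) + (sra(b, 31) & a) is precisely that correction.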
21720 // The first result of MUL_LOHI is actually the low value, followed by the
21721 // high value.
21722 SDValue Ops[] = {Lows, Highs};
21723 return DAG.getMergeValues(Ops, dl);
21724 }
21726 // Return true if the required (according to Opcode) shift-imm form is natively
21727 // supported by the Subtarget
21728 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21729 unsigned Opcode) {
21730 if (VT.getScalarSizeInBits() < 16)
21731 return false;
21733 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21734 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21735 return true;
21737 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21738 (VT.is256BitVector() && Subtarget.hasInt256());
21740 bool AShift = LShift && (Subtarget.hasAVX512() ||
21741 (VT != MVT::v2i64 && VT != MVT::v4i64));
21742 return (Opcode == ISD::SRA) ? AShift : LShift;
21743 }
21745 // The shift amount is a variable, but it is the same for all vector lanes.
21746 // These instructions are defined together with shift-immediate.
21747 static
21748 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21749 unsigned Opcode) {
21750 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21751 }
21753 // Return true if the required (according to Opcode) variable-shift form is
21754 // natively supported by the Subtarget
21755 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21756 unsigned Opcode) {
21758 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21759 return false;
21761 // vXi16 supported only on AVX-512, BWI
21762 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21763 return false;
21765 if (Subtarget.hasAVX512())
21766 return true;
21768 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21769 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21770 return (Opcode == ISD::SRA) ? AShift : LShift;
21771 }
21773 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21774 const X86Subtarget &Subtarget) {
21775 MVT VT = Op.getSimpleValueType();
21776 SDLoc dl(Op);
21777 SDValue R = Op.getOperand(0);
21778 SDValue Amt = Op.getOperand(1);
21780 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21781 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21783 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21784 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21785 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21786 SDValue Ex = DAG.getBitcast(ExVT, R);
21788 // ashr(R, 63) === cmp_slt(R, 0)
21789 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
21790 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
21791 "Unsupported PCMPGT op");
21792 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
21793 getZeroVector(VT, Subtarget, DAG, dl), R);
21794 }
21796 if (ShiftAmt >= 32) {
21797 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21799 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21800 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21801 ShiftAmt - 32, DAG);
21802 if (VT == MVT::v2i64)
21803 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21804 if (VT == MVT::v4i64)
21805 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21806 {9, 1, 11, 3, 13, 5, 15, 7});
21807 } else {
21808 // SRA upper i32, SHL whole i64 and select lower i32.
21809 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21810 ShiftAmt, DAG);
21811 SDValue Lower =
21812 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21813 Lower = DAG.getBitcast(ExVT, Lower);
21814 if (VT == MVT::v2i64)
21815 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21816 if (VT == MVT::v4i64)
21817 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21818 {8, 1, 10, 3, 12, 5, 14, 7});
21819 }
21820 return DAG.getBitcast(VT, Ex);
21821 };
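// For example, with ShiftAmt == 40 (the >= 32 path) each i64 lane becomes
// { low i32 = sra(hi, 8), high i32 = sra(hi, 31) sign splat }, which is what
// the {5, 1, 7, 3}-style shuffles of Upper/Lower select per 64-bit lane.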
21823 // Optimize shl/srl/sra with constant shift amount.
21824 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21825 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21826 uint64_t ShiftAmt = ShiftConst->getZExtValue();
21828 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21829 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21831 // i64 SRA needs to be performed as partial shifts.
21832 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21833 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21834 return ArithmeticShiftRight64(ShiftAmt);
21836 if (VT == MVT::v16i8 ||
21837 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21838 VT == MVT::v64i8) {
21839 unsigned NumElts = VT.getVectorNumElements();
21840 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21842 // Simple i8 add case
21843 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21844 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21846 // ashr(R, 7) === cmp_slt(R, 0)
21847 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21848 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21849 if (VT.is512BitVector()) {
21850 assert(VT == MVT::v64i8 && "Unexpected element type!");
21851 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21852 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21853 }
21854 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21855 }
21857 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21858 if (VT == MVT::v16i8 && Subtarget.hasXOP())
21859 return SDValue();
21861 if (Op.getOpcode() == ISD::SHL) {
21862 // Make a large shift.
21863 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
21864 R, ShiftAmt, DAG);
21865 SHL = DAG.getBitcast(VT, SHL);
21866 // Zero out the rightmost bits.
21867 return DAG.getNode(ISD::AND, dl, VT, SHL,
21868 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21869 }
21870 if (Op.getOpcode() == ISD::SRL) {
21871 // Make a large shift.
21872 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
21873 R, ShiftAmt, DAG);
21874 SRL = DAG.getBitcast(VT, SRL);
21875 // Zero out the leftmost bits.
21876 return DAG.getNode(ISD::AND, dl, VT, SRL,
21877 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21878 }
21879 if (Op.getOpcode() == ISD::SRA) {
21880 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
21881 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21883 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21884 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21885 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
21886 return Res;
21887 }
21888 llvm_unreachable("Unknown shift opcode.");
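// Worked example for the SRL case above: shifting v16i8 right by 3 as a
// v8i16 shift leaks the bottom 3 bits of each odd byte into the top of its
// even neighbour; ANDing with 0xFF >> 3 == 0x1F clears exactly those bits,
// since a true per-byte lshr by 3 always has its top 3 bits zero.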
21893 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21894 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
21895 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21896 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21897 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21899 // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
21900 unsigned SubVectorScale = 1;
21901 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
21902 SubVectorScale =
21903 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
21904 Amt = Amt.getOperand(0);
21905 }
21907 // Peek through any splat that was introduced for i64 shift vectorization.
21908 int SplatIndex = -1;
21909 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21910 if (SVN->isSplat()) {
21911 SplatIndex = SVN->getSplatIndex();
21912 Amt = Amt.getOperand(0);
21913 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21914 "Splat shuffle referencing second operand");
21917 if (Amt.getOpcode() != ISD::BITCAST ||
21918 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
21919 return SDValue();
21921 Amt = Amt.getOperand(0);
21922 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21923 (SubVectorScale * VT.getVectorNumElements());
21924 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21925 uint64_t ShiftAmt = 0;
21926 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21927 for (unsigned i = 0; i != Ratio; ++i) {
21928 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
21929 if (!C)
21930 return SDValue();
21931 // 6 == Log2(64)
21932 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21933 }
21935 // Check remaining shift amounts (if not a splat).
21936 if (SplatIndex < 0) {
21937 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21938 uint64_t ShAmt = 0;
21939 for (unsigned j = 0; j != Ratio; ++j) {
21940 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
21941 if (!C)
21942 return SDValue();
21943 // 6 == Log2(64)
21944 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
21945 }
21946 if (ShAmt != ShiftAmt)
21947 return SDValue();
21948 }
21949 }
21951 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21952 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21954 if (Op.getOpcode() == ISD::SRA)
21955 return ArithmeticShiftRight64(ShiftAmt);
21956 }
21958 return SDValue();
21959 }
21961 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21962 const X86Subtarget &Subtarget) {
21963 MVT VT = Op.getSimpleValueType();
21964 SDLoc dl(Op);
21965 SDValue R = Op.getOperand(0);
21966 SDValue Amt = Op.getOperand(1);
21968 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21969 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21971 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21972 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21974 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21975 SDValue BaseShAmt;
21976 MVT EltVT = VT.getVectorElementType();
21978 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
21979 // Check if this build_vector node is doing a splat.
21980 // If so, then set BaseShAmt equal to the splat value.
21981 BaseShAmt = BV->getSplatValue();
21982 if (BaseShAmt && BaseShAmt.isUndef())
21983 BaseShAmt = SDValue();
21984 } else {
21985 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
21986 Amt = Amt.getOperand(0);
21988 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
21989 if (SVN && SVN->isSplat()) {
21990 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
21991 SDValue InVec = Amt.getOperand(0);
21992 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
21993 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
21994 "Unexpected shuffle index found!");
21995 BaseShAmt = InVec.getOperand(SplatIdx);
21996 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
21997 if (ConstantSDNode *C =
21998 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
21999 if (C->getZExtValue() == SplatIdx)
22000 BaseShAmt = InVec.getOperand(1);
22001 }
22002 }
22004 if (!BaseShAmt)
22005 // Avoid introducing an extract element from a shuffle.
22006 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22007 DAG.getIntPtrConstant(SplatIdx, dl));
22008 }
22009 }
22011 if (BaseShAmt.getNode()) {
22012 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22013 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22014 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22015 else if (EltVT.bitsLT(MVT::i32))
22016 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22018 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22019 }
22020 }
22022 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22023 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
22024 Amt.getOpcode() == ISD::BITCAST &&
22025 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22026 Amt = Amt.getOperand(0);
22027 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22028 VT.getVectorNumElements();
22029 std::vector<SDValue> Vals(Ratio);
22030 for (unsigned i = 0; i != Ratio; ++i)
22031 Vals[i] = Amt.getOperand(i);
22032 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22033 for (unsigned j = 0; j != Ratio; ++j)
22034 if (Vals[j] != Amt.getOperand(i + j))
22035 return SDValue();
22036 }
22038 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22039 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22040 }
22041 return SDValue();
22042 }
22044 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22045 SelectionDAG &DAG) {
22046 MVT VT = Op.getSimpleValueType();
22047 SDLoc dl(Op);
22048 SDValue R = Op.getOperand(0);
22049 SDValue Amt = Op.getOperand(1);
22050 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22052 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22053 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22055 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22056 return V;
22058 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22059 return V;
22061 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22062 return Op;
22064 // XOP has 128-bit variable logical/arithmetic shifts.
22065 // +ve/-ve Amt = shift left/right.
22066 if (Subtarget.hasXOP() &&
22067 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22068 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22069 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22070 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22071 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22072 }
22073 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22074 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22075 if (Op.getOpcode() == ISD::SRA)
22076 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22077 }
22079 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22080 // shifts per-lane and then shuffle the partial results back together.
22081 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22082 // Splat the shift amounts so the scalar shifts above will catch it.
22083 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22084 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22085 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22086 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22087 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22088 }
22090 // i64 vector arithmetic shift can be emulated with the transform:
22091 // M = lshr(SIGN_MASK, Amt)
22092 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22093 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22094 Op.getOpcode() == ISD::SRA) {
22095 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22096 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22097 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22098 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22099 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22100 return R;
22101 }
22103 // If possible, lower this packed shift into a vector multiply instead of
22104 // expanding it into a sequence of scalar shifts.
22105 // Do this only if the vector shift count is a constant build_vector.
22106 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22107 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22108 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22109 SmallVector<SDValue, 8> Elts;
22110 MVT SVT = VT.getVectorElementType();
22111 unsigned SVTBits = SVT.getSizeInBits();
22112 APInt One(SVTBits, 1);
22113 unsigned NumElems = VT.getVectorNumElements();
22115 for (unsigned i=0; i !=NumElems; ++i) {
22116 SDValue Op = Amt->getOperand(i);
22117 if (Op->isUndef()) {
22118 Elts.push_back(Op);
22119 continue;
22120 }
22122 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22123 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22124 uint64_t ShAmt = C.getZExtValue();
22125 if (ShAmt >= SVTBits) {
22126 Elts.push_back(DAG.getUNDEF(SVT));
22127 continue;
22128 }
22129 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22130 }
22131 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22132 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22133 }
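// E.g. (shl <4 x i32> R, <1, 2, 3, 4>) becomes (mul R, <2, 4, 8, 16>), i.e.
// a single vector multiply instead of four extract/shift/insert sequences.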
22135 // Lower SHL with variable shift amount.
22136 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22137 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22139 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22140 DAG.getConstant(0x3f800000U, dl, VT));
22141 Op = DAG.getBitcast(MVT::v4f32, Op);
22142 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22143 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22144 }
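// This works because the IEEE-754 single 2^Amt (0 <= Amt < 32) has the bit
// pattern (127 + Amt) << 23: adding Amt << 23 to 0x3f800000 (1.0f, i.e.
// 127 << 23) constructs 2^Amt in each float lane, FP_TO_SINT recovers the
// integer 2^Amt, and the final MUL implements R << Amt per lane.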
22146 // If possible, lower this shift as a sequence of two shifts by
22147 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22149 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22151 // Could be rewritten as:
22152 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22154 // The advantage is that the two shifts from the example would be
22155 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22156 // the vector shift into four scalar shifts plus four pairs of vector
22157 // insert/extract.
22158 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22159 unsigned TargetOpcode = X86ISD::MOVSS;
22160 bool CanBeSimplified;
22161 // The splat value for the first packed shift (the 'X' from the example).
22162 SDValue Amt1 = Amt->getOperand(0);
22163 // The splat value for the second packed shift (the 'Y' from the example).
22164 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22166 // See if it is possible to replace this node with a sequence of
22167 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22168 if (VT == MVT::v4i32) {
22169 // Check if it is legal to use a MOVSS.
22170 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22171 Amt2 == Amt->getOperand(3);
22172 if (!CanBeSimplified) {
22173 // Otherwise, check if we can still simplify this node using a MOVSD.
22174 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22175 Amt->getOperand(2) == Amt->getOperand(3);
22176 TargetOpcode = X86ISD::MOVSD;
22177 Amt2 = Amt->getOperand(2);
22178 }
22179 } else {
22180 // Do similar checks for the case where the machine value type
22181 // is MVT::v8i16.
22182 CanBeSimplified = Amt1 == Amt->getOperand(1);
22183 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22184 CanBeSimplified = Amt2 == Amt->getOperand(i);
22186 if (!CanBeSimplified) {
22187 TargetOpcode = X86ISD::MOVSD;
22188 CanBeSimplified = true;
22189 Amt2 = Amt->getOperand(4);
22190 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22191 CanBeSimplified = Amt1 == Amt->getOperand(i);
22192 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22193 CanBeSimplified = Amt2 == Amt->getOperand(j);
22194 }
22195 }
22197 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22198 isa<ConstantSDNode>(Amt2)) {
22199 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22200 MVT CastVT = MVT::v4i32;
22201 SDValue Splat1 =
22202 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22203 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22204 SDValue Splat2 =
22205 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22206 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22207 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22208 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22209 if (TargetOpcode == X86ISD::MOVSD)
22210 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22211 BitCast2, {0, 1, 6, 7}));
22212 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22213 BitCast2, {0, 5, 6, 7}));
22214 }
22215 }
22217 // v4i32 Non Uniform Shifts.
22218 // If the shift amount is constant we can shift each lane using the SSE2
22219 // immediate shifts, else we need to zero-extend each lane to the lower i64
22220 // and shift using the SSE2 variable shifts.
22221 // The separate results can then be blended together.
22222 if (VT == MVT::v4i32) {
22223 unsigned Opc = Op.getOpcode();
22224 SDValue Amt0, Amt1, Amt2, Amt3;
22225 if (ConstantAmt) {
22226 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22227 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22228 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22229 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22230 } else {
22231 // ISD::SHL is handled above but we include it here for completeness.
22232 switch (Opc) {
22233 default:
22234 llvm_unreachable("Unknown target vector shift node");
22236 Opc = X86ISD::VSHL;
22237 break;
22238 case ISD::SRL:
22239 Opc = X86ISD::VSRL;
22240 break;
22241 case ISD::SRA:
22242 Opc = X86ISD::VSRA;
22243 break;
22244 }
22245 // The SSE2 shifts use the lower i64 as the same shift amount for
22246 // all lanes and the upper i64 is ignored. These shuffle masks
22247 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22248 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22249 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22250 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22251 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22252 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22253 }
22255 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22256 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22257 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22258 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22259 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22260 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22261 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22262 }
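// Each Ri above shifted the whole vector by lane i's amount, so the two
// blends and the final shuffle just gather lane 0 of R0, lane 1 of R1,
// lane 2 of R2 and lane 3 of R3 into the result.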
22264 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22265 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22266 // make the existing SSE solution better.
22267 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22268 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22269 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22270 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22271 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22272 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22273 unsigned ExtOpc =
22274 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22275 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22276 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22277 return DAG.getNode(ISD::TRUNCATE, dl, VT,
22278 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22279 }
22281 if (VT == MVT::v16i8 ||
22282 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22283 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22284 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22285 unsigned ShiftOpcode = Op->getOpcode();
22287 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22288 if (VT.is512BitVector()) {
22289 // On AVX512BW targets we make use of the fact that VSELECT lowers
22290 // to a masked blend which selects bytes based just on the sign bit
22291 // extracted to a mask.
22292 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22293 V0 = DAG.getBitcast(VT, V0);
22294 V1 = DAG.getBitcast(VT, V1);
22295 Sel = DAG.getBitcast(VT, Sel);
22296 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22297 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22298 } else if (Subtarget.hasSSE41()) {
22299 // On SSE41 targets we make use of the fact that VSELECT lowers
22300 // to PBLENDVB which selects bytes based just on the sign bit.
22301 V0 = DAG.getBitcast(VT, V0);
22302 V1 = DAG.getBitcast(VT, V1);
22303 Sel = DAG.getBitcast(VT, Sel);
22304 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22305 }
22306 // On pre-SSE41 targets we test for the sign bit by comparing to
22307 // zero - a negative value will set all bits of the lanes to true
22308 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22309 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22310 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22311 return DAG.getSelect(dl, SelVT, C, V0, V1);
22312 };
22314 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22315 // We can safely do this using i16 shifts as we're only interested in
22316 // the 3 lower bits of each byte.
22317 Amt = DAG.getBitcast(ExtVT, Amt);
22318 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22319 Amt = DAG.getBitcast(VT, Amt);
22321 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22322 // r = VSELECT(r, shift(r, 4), a);
22323 SDValue M =
22324 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22325 R = SignBitSelect(VT, Amt, M, R);
22327 // a += a
22328 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22330 // r = VSELECT(r, shift(r, 2), a);
22331 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22332 R = SignBitSelect(VT, Amt, M, R);
22334 // a += a
22335 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22337 // return VSELECT(r, shift(r, 1), a);
22338 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22339 R = SignBitSelect(VT, Amt, M, R);
22340 return R;
22341 }
22343 if (Op->getOpcode() == ISD::SRA) {
22344 // For SRA we need to unpack each byte to the higher byte of a i16 vector
22345 // so we can correctly sign extend. We don't care what happens to the
22346 // lower byte.
22347 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22348 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22349 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22350 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22351 ALo = DAG.getBitcast(ExtVT, ALo);
22352 AHi = DAG.getBitcast(ExtVT, AHi);
22353 RLo = DAG.getBitcast(ExtVT, RLo);
22354 RHi = DAG.getBitcast(ExtVT, RHi);
22356 // r = VSELECT(r, shift(r, 4), a);
22357 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22358 DAG.getConstant(4, dl, ExtVT));
22359 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22360 DAG.getConstant(4, dl, ExtVT));
22361 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22362 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22364 // a += a
22365 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22366 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22368 // r = VSELECT(r, shift(r, 2), a);
22369 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22370 DAG.getConstant(2, dl, ExtVT));
22371 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22372 DAG.getConstant(2, dl, ExtVT));
22373 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22374 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22376 // a += a
22377 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22378 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22380 // r = VSELECT(r, shift(r, 1), a);
22381 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22382 DAG.getConstant(1, dl, ExtVT));
22383 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22384 DAG.getConstant(1, dl, ExtVT));
22385 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22386 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22388 // Logical shift the result back to the lower byte, leaving a zero upper
22389 // byte
22390 // meaning that we can safely pack with PACKUSWB.
22391 RLo =
22392 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22394 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22395 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22396 }
22397 }
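// Unpacking into the high byte of each i16 lane is what makes this correct:
// the i16 arithmetic shifts above then see each byte's sign bit as the lane
// sign bit, and the final logical shift right by 8 moves the shifted byte
// back down while leaving a zero upper byte for PACKUS.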
22399 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22400 MVT ExtVT = MVT::v8i32;
22401 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22402 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22403 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22404 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22405 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22406 ALo = DAG.getBitcast(ExtVT, ALo);
22407 AHi = DAG.getBitcast(ExtVT, AHi);
22408 RLo = DAG.getBitcast(ExtVT, RLo);
22409 RHi = DAG.getBitcast(ExtVT, RHi);
22410 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22411 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22412 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22413 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22414 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22415 }
22417 if (VT == MVT::v8i16) {
22418 unsigned ShiftOpcode = Op->getOpcode();
22420 // If we have a constant shift amount, the non-SSE41 path is best as
22421 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
22422 bool UseSSE41 = Subtarget.hasSSE41() &&
22423 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22425 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22426 // On SSE41 targets we make use of the fact that VSELECT lowers
22427 // to PBLENDVB which selects bytes based just on the sign bit.
22428 if (UseSSE41) {
22429 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22430 V0 = DAG.getBitcast(ExtVT, V0);
22431 V1 = DAG.getBitcast(ExtVT, V1);
22432 Sel = DAG.getBitcast(ExtVT, Sel);
22433 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
22434 }
22435 // On pre-SSE41 targets we splat the sign bit - a negative value will
22436 // set all bits of the lanes to true and VSELECT uses that in
22437 // its OR(AND(V0,C),AND(V1,~C)) lowering.
22438 SDValue C =
22439 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22440 return DAG.getSelect(dl, VT, C, V0, V1);
22441 };
22443 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22444 if (UseSSE41) {
22445 // On SSE41 targets we need to replicate the shift mask in both
22446 // bytes for PBLENDVB.
22447 Amt = DAG.getNode(
22448 ISD::OR, dl, VT,
22449 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22450 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22451 } else {
22452 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22453 }
22455 // r = VSELECT(r, shift(r, 8), a);
22456 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22457 R = SignBitSelect(Amt, M, R);
22459 // a += a
22460 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22462 // r = VSELECT(r, shift(r, 4), a);
22463 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22464 R = SignBitSelect(Amt, M, R);
22466 // a += a
22467 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22469 // r = VSELECT(r, shift(r, 2), a);
22470 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22471 R = SignBitSelect(Amt, M, R);
22473 // a += a
22474 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22476 // return VSELECT(r, shift(r, 1), a);
22477 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22478 R = SignBitSelect(Amt, M, R);
22479 return R;
22480 }
22482 // Decompose 256-bit shifts into smaller 128-bit shifts.
22483 if (VT.is256BitVector())
22484 return Lower256IntArith(Op, DAG);
22486 return SDValue();
22487 }
22489 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22490 SelectionDAG &DAG) {
22491 MVT VT = Op.getSimpleValueType();
22492 SDLoc DL(Op);
22493 SDValue R = Op.getOperand(0);
22494 SDValue Amt = Op.getOperand(1);
22496 assert(VT.isVector() && "Custom lowering only for vector rotates!");
22497 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22498 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
22500 // XOP has 128-bit vector variable + immediate rotates.
22501 // +ve/-ve Amt = rotate left/right.
22503 // Split 256-bit integers.
22504 if (VT.is256BitVector())
22505 return Lower256IntArith(Op, DAG);
22507 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22509 // Attempt to rotate by immediate.
22510 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22511 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22512 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22513 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
22514 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22515 DAG.getConstant(RotateAmt, DL, MVT::i8));
22516 }
22517 }
22519 // Use general rotate by variable (per-element).
22520 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22521 }
22523 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22524 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22525 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22526 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22527 // has only one use.
22528 SDNode *N = Op.getNode();
22529 SDValue LHS = N->getOperand(0);
22530 SDValue RHS = N->getOperand(1);
22531 unsigned BaseOp = 0;
22532 X86::CondCode Cond;
22533 SDLoc DL(Op);
22534 switch (Op.getOpcode()) {
22535 default: llvm_unreachable("Unknown ovf instruction!");
22537 // An add of one will be selected as an INC. Note that INC doesn't
22538 // set CF, so we can't do this for UADDO.
22539 if (isOneConstant(RHS)) {
22540 BaseOp = X86ISD::INC;
22541 Cond = X86::COND_O;
22542 break;
22543 }
22544 BaseOp = X86ISD::ADD;
22545 Cond = X86::COND_O;
22546 break;
22547 case ISD::UADDO:
22548 BaseOp = X86ISD::ADD;
22549 Cond = X86::COND_B;
22550 break;
22551 case ISD::SSUBO:
22552 // A subtract of one will be selected as a DEC. Note that DEC doesn't
22553 // set CF, so we can't do this for USUBO.
22554 if (isOneConstant(RHS)) {
22555 BaseOp = X86ISD::DEC;
22556 Cond = X86::COND_O;
22557 break;
22558 }
22559 BaseOp = X86ISD::SUB;
22560 Cond = X86::COND_O;
22561 break;
22562 case ISD::USUBO:
22563 BaseOp = X86ISD::SUB;
22564 Cond = X86::COND_B;
22565 break;
22566 case ISD::SMULO:
22567 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22568 Cond = X86::COND_O;
22569 break;
22570 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22571 if (N->getValueType(0) == MVT::i8) {
22572 BaseOp = X86ISD::UMUL8;
22573 Cond = X86::COND_O;
22574 break;
22575 }
22576 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22577 MVT::i32);
22578 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22580 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22582 if (N->getValueType(1) == MVT::i1)
22583 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22585 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22586 }
22587 }
22589 // Also sets EFLAGS.
22590 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22591 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22593 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22595 if (N->getValueType(1) == MVT::i1)
22596 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22598 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22599 }
22601 /// Returns true if the operand type is exactly twice the native width, and
22602 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22603 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22604 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22605 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22606 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22608 if (OpWidth == 64)
22609 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22610 else if (OpWidth == 128)
22611 return Subtarget.hasCmpxchg16b();
22613 return false;
22614 }
22616 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22617 return needsCmpXchgNb(SI->getValueOperand()->getType());
22618 }
22620 // Note: this turns large loads into lock cmpxchg8b/16b.
22621 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22622 TargetLowering::AtomicExpansionKind
22623 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22624 auto PTy = cast<PointerType>(LI->getPointerOperandType());
22625 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22626 : AtomicExpansionKind::None;
22627 }
22629 TargetLowering::AtomicExpansionKind
22630 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22631 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22632 Type *MemType = AI->getType();
22634 // If the operand is too big, we must see if cmpxchg8/16b is available
22635 // and default to library calls otherwise.
22636 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22637 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22638 : AtomicExpansionKind::None;
22639 }
22641 AtomicRMWInst::BinOp Op = AI->getOperation();
22642 switch (Op) {
22643 default:
22644 llvm_unreachable("Unknown atomic operation");
22645 case AtomicRMWInst::Xchg:
22646 case AtomicRMWInst::Add:
22647 case AtomicRMWInst::Sub:
22648 // It's better to use xadd, xsub or xchg for these in all cases.
22649 return AtomicExpansionKind::None;
22650 case AtomicRMWInst::Or:
22651 case AtomicRMWInst::And:
22652 case AtomicRMWInst::Xor:
22653 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22654 // prefix to a normal instruction for these operations.
22655 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22656 : AtomicExpansionKind::None;
22657 case AtomicRMWInst::Nand:
22658 case AtomicRMWInst::Max:
22659 case AtomicRMWInst::Min:
22660 case AtomicRMWInst::UMax:
22661 case AtomicRMWInst::UMin:
22662 // These always require a non-trivial set of data operations on x86. We must
22663 // use a cmpxchg loop.
22664 return AtomicExpansionKind::CmpXChg;
22665 }
22666 }
22668 LoadInst *
22669 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22670 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22671 Type *MemType = AI->getType();
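// For example, an "atomicrmw add i32* %p, i32 0 seq_cst" changes no data, so
// below it is rewritten (when profitable) as an mfence followed by a plain
// atomic load of %p.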
22672 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22673 // there is no benefit in turning such RMWs into loads, and it is actually
22674 // harmful as it introduces a mfence.
22675 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22676 return nullptr;
22678 auto Builder = IRBuilder<>(AI);
22679 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22680 auto SynchScope = AI->getSynchScope();
22681 // We must restrict the ordering to avoid generating loads with Release or
22682 // ReleaseAcquire orderings.
22683 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22684 auto Ptr = AI->getPointerOperand();
22686 // Before the load we need a fence. Here is an example lifted from
22687 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22688 // is required:
22689 // Thread 0:
22690 // x.store(1, relaxed);
22691 // r1 = y.fetch_add(0, release);
22692 // Thread 1:
22693 // y.fetch_add(42, acquire);
22694 // r2 = x.load(relaxed);
22695 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22696 // lowered to just a load without a fence. A mfence flushes the store buffer,
22697 // making the optimization clearly correct.
22698 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22699 // otherwise, we might be able to be more aggressive on relaxed idempotent
22700 // rmw. In practice, they do not look useful, so we don't try to be
22701 // especially clever.
22702 if (SynchScope == SingleThread)
22703 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22704 // the IR level, so we must wrap it in an intrinsic.
22705 return nullptr;
22707 if (!Subtarget.hasMFence())
22708 // FIXME: it might make sense to use a locked operation here but on a
22709 // different cache-line to prevent cache-line bouncing. In practice it
22710 // is probably a small win, and x86 processors without mfence are rare
22711 // enough that we do not bother.
22712 return nullptr;
22714 Function *MFence =
22715 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22716 Builder.CreateCall(MFence, {});
22718 // Finally we can emit the atomic load.
22719 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22720 AI->getType()->getPrimitiveSizeInBits());
22721 Loaded->setAtomic(Order, SynchScope);
22722 AI->replaceAllUsesWith(Loaded);
22723 AI->eraseFromParent();
22724 return Loaded;
22725 }
22727 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22728 SelectionDAG &DAG) {
22729 SDLoc dl(Op);
22730 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22731 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22732 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
22733 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22735 // The only fence that needs an instruction is a sequentially-consistent
22736 // cross-thread fence.
22737 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22738 FenceScope == CrossThread) {
22739 if (Subtarget.hasMFence())
22740 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
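// Without MFENCE, fall back to the classic "lock or dword ptr [esp], 0"
// idiom: any LOCKed read-modify-write acts as a full barrier, and targeting
// the word at ESP keeps the store harmless and likely already in cache.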
22742 SDValue Chain = Op.getOperand(0);
22743 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22744 SDValue Ops[] = {
22745 DAG.getRegister(X86::ESP, MVT::i32), // Base
22746 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
22747 DAG.getRegister(0, MVT::i32), // Index
22748 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
22749 DAG.getRegister(0, MVT::i32), // Segment.
22750 Zero,
22751 Chain
22752 };
22753 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22754 return SDValue(Res, 0);
22755 }
22757 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22758 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
22759 }
22761 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22762 SelectionDAG &DAG) {
22763 MVT T = Op.getSimpleValueType();
22764 SDLoc DL(Op);
22765 unsigned Reg = 0;
22766 unsigned size = 0;
22767 switch(T.SimpleTy) {
22768 default: llvm_unreachable("Invalid value type!");
22769 case MVT::i8: Reg = X86::AL; size = 1; break;
22770 case MVT::i16: Reg = X86::AX; size = 2; break;
22771 case MVT::i32: Reg = X86::EAX; size = 4; break;
22772 case MVT::i64:
22773 assert(Subtarget.is64Bit() && "Node not type legal!");
22774 Reg = X86::RAX; size = 8;
22775 break;
22776 }
22777 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22778 Op.getOperand(2), SDValue());
22779 SDValue Ops[] = { cpIn.getValue(0),
22780 Op.getOperand(1),
22781 Op.getOperand(3),
22782 DAG.getTargetConstant(size, DL, MVT::i8),
22783 cpIn.getValue(1) };
22784 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22785 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22786 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
22787 Ops, T, MMO);
22789 SDValue cpOut =
22790 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22791 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22792 MVT::i32, cpOut.getValue(2));
22793 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
22795 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22796 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22797 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
22798 return SDValue();
22799 }
22801 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22802 SelectionDAG &DAG) {
22803 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22804 MVT DstVT = Op.getSimpleValueType();
22806 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22807 SrcVT == MVT::i64) {
22808 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22809 if (DstVT != MVT::f64)
22810 // This conversion needs to be expanded.
22811 return SDValue();
22813 SDValue Op0 = Op->getOperand(0);
22814 SmallVector<SDValue, 16> Elts;
22815 SDLoc dl(Op);
22816 unsigned NumElts;
22817 MVT SVT;
22818 if (SrcVT.isVector()) {
22819 NumElts = SrcVT.getVectorNumElements();
22820 SVT = SrcVT.getVectorElementType();
22822 // Widen the input vector in the case of MVT::v2i32.
22823 // Example: from MVT::v2i32 to MVT::v4i32.
22824 for (unsigned i = 0, e = NumElts; i != e; ++i)
22825 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
22826 DAG.getIntPtrConstant(i, dl)));
22827 } else {
22828 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22829 "Unexpected source type in LowerBITCAST");
22830 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22831 DAG.getIntPtrConstant(0, dl)));
22832 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22833 DAG.getIntPtrConstant(1, dl)));
22834 SVT = MVT::i32;
22835 NumElts = 2;
22836 }
22837 // Explicitly mark the extra elements as Undef.
22838 Elts.append(NumElts, DAG.getUNDEF(SVT));
22840 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22841 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
22842 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
22843 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
22844 DAG.getIntPtrConstant(0, dl));
22845 }
22847 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22848 Subtarget.hasMMX() && "Unexpected custom BITCAST");
22849 assert((DstVT == MVT::i64 ||
22850 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22851 "Unexpected custom BITCAST");
22852 // i64 <=> MMX conversions are Legal.
22853 if (SrcVT==MVT::i64 && DstVT.isVector())
22854 return Op;
22855 if (DstVT==MVT::i64 && SrcVT.isVector())
22856 return Op;
22857 // MMX <=> MMX conversions are Legal.
22858 if (SrcVT.isVector() && DstVT.isVector())
22859 return Op;
22860 // All other conversions need to be expanded.
22861 return SDValue();
22862 }
22864 /// Compute the horizontal sum of bytes in V for the elements of VT.
22865 ///
22866 /// Requires V to be a byte vector and VT to be an integer vector type with
22867 /// wider elements than V's type. The width of the elements of VT determines
22868 /// how many bytes of V are summed horizontally to produce each element of the
22869 /// result.
22870 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
22871 const X86Subtarget &Subtarget,
22872 SelectionDAG &DAG) {
22873 SDLoc DL(V);
22874 MVT ByteVecVT = V.getSimpleValueType();
22875 MVT EltVT = VT.getVectorElementType();
22876 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
22877 "Expected value to have byte element type.");
22878 assert(EltVT != MVT::i8 &&
22879 "Horizontal byte sum only makes sense for wider elements!");
22880 unsigned VecSize = VT.getSizeInBits();
22881 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
22883 // The PSADBW instruction horizontally adds all bytes and leaves the result in
22884 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
22885 if (EltVT == MVT::i64) {
22886 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22887 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22888 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
22889 return DAG.getBitcast(VT, V);
22890 }
22892 if (EltVT == MVT::i32) {
22893 // We unpack the low half and high half into i32s interleaved with zeros so
22894 // that we can use PSADBW to horizontally sum them. The most useful part of
22895 // this is that it lines up the results of two PSADBW instructions to be
22896 // two v2i64 vectors which concatenated are the 4 population counts. We can
22897 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
22898 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
22899 SDValue V32 = DAG.getBitcast(VT, V);
22900 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
22901 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
22903 // Do the horizontal sums into two v2i64s.
22904 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22905 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22906 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22907 DAG.getBitcast(ByteVecVT, Low), Zeros);
22908 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22909 DAG.getBitcast(ByteVecVT, High), Zeros);
22911 // Merge them together.
22912 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
22913 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
22914 DAG.getBitcast(ShortVecVT, Low),
22915 DAG.getBitcast(ShortVecVT, High));
22917 return DAG.getBitcast(VT, V);
22920 // The only element type left is i16.
22921 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22923 // To obtain the pop count for each i16 element starting from the pop count
22924 // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift as
22925 // i16s right by 8. It is important to shift as i16s since an i8 vector shift
22926 // isn't directly supported.
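// Worked sketch for a single i16 lane whose bytes hold the pop counts
// [lo = L, hi = H]: shifting left by 8 gives bytes [0, L], the byte-wise
// add gives [L, H + L], and the final i16 shift right by 8 leaves H + L,
// the pop count of the whole 16-bit element.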
22927 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22928 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22929 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22930 DAG.getBitcast(ByteVecVT, V));
22931 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22934 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22935 const X86Subtarget &Subtarget,
22936 SelectionDAG &DAG) {
22937 MVT VT = Op.getSimpleValueType();
22938 MVT EltVT = VT.getVectorElementType();
22939 unsigned VecSize = VT.getSizeInBits();
22941 // Implement an in-register lookup table by using an algorithm based on:
22942 // http://wm.ite.pl/articles/sse-popcount.html
22944 // The general idea is that every lower byte nibble in the input vector is an
22945 // index into an in-register pre-computed pop count table. We then split up the
22946 // input vector into two new ones: (1) a vector with only the shifted-right
22947 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22948 // masked-out higher ones) for each byte. PSHUFB is used separately with both
22949 // to index the in-register table. Next, both are added and the result is an
22950 // i8 vector where each element contains the pop count for its input byte.
22952 // To obtain the pop count for elements != i8, we follow up with the same
22953 // approach and use additional tricks as described below.
22955 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22956 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22957 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22958 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
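// For instance (illustrative), the byte 0x9D = 0b10011101 splits into high
// nibble 0x9 and low nibble 0xD, so its pop count is computed as
// LUT[0x9] + LUT[0xD] = 2 + 3 = 5.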
22960 int NumByteElts = VecSize / 8;
22961 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22962 SDValue In = DAG.getBitcast(ByteVecVT, Op);
22963 SmallVector<SDValue, 64> LUTVec;
22964 for (int i = 0; i < NumByteElts; ++i)
22965 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22966 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22967 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
22970 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22971 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
22974 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
22976 // The input vector is used as the shuffle mask that indexes elements into the
22977 // LUT. After counting low and high nibbles, add the results to obtain the
22978 // final pop count per i8 element.
22979 SDValue HighPopCnt =
22980 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
22981 SDValue LowPopCnt =
22982 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
22983 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
22985 if (EltVT == MVT::i8)
22986 return PopCnt;
22988 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
22991 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
22992 const X86Subtarget &Subtarget,
22993 SelectionDAG &DAG) {
22994 MVT VT = Op.getSimpleValueType();
22995 assert(VT.is128BitVector() &&
22996 "Only 128-bit vector bitmath lowering supported.");
22998 int VecSize = VT.getSizeInBits();
22999 MVT EltVT = VT.getVectorElementType();
23000 int Len = EltVT.getSizeInBits();
23002 // This is the vectorized version of the "best" algorithm from
23003 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23004 // with a minor tweak to use a series of adds + shifts instead of vector
23005 // multiplications. Implemented for all integer vector types. We only use
23006 // this when we don't have SSSE3 which allows a LUT-based lowering that is
23007 // much faster, even faster than using native popcnt instructions.
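// A scalar sketch of the same three steps on one 32-bit lane (illustrative
// only; the code below applies them lane-wise with vector operations):
//   v = v - ((v >> 1) & 0x55555555);
//   v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
//   v = (v + (v >> 4)) & 0x0F0F0F0F;
// after which every byte of v holds the pop count of that byte.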
23009 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23010 MVT VT = V.getSimpleValueType();
23011 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23012 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23014 auto GetMask = [&](SDValue V, APInt Mask) {
23015 MVT VT = V.getSimpleValueType();
23016 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23017 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23020 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23021 // x86, so set the SRL type to have elements at least i16 wide. This is
23022 // correct because all of our SRLs are followed immediately by a mask anyway
23023 // that handles any bits that sneak into the high bits of the byte elements.
23024 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23026 SDValue V = Op;
23028 // v = v - ((v >> 1) & 0x55555555...)
23029 SDValue Srl =
23030 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23031 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23032 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23034 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23035 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23036 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23037 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23038 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23040 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23041 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23042 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23043 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23045 // At this point, V contains the byte-wise population count, and we are
23046 // merely doing a horizontal sum if necessary to get the wider element
23047 // type.
23048 if (EltVT == MVT::i8)
23049 return V;
23051 return LowerHorizontalByteSum(
23052 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23056 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23057 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23058 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23059 SelectionDAG &DAG) {
23060 MVT VT = Op.getSimpleValueType();
23061 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23062 "Unknown CTPOP type to handle");
23063 SDLoc DL(Op.getNode());
23064 SDValue Op0 = Op.getOperand(0);
23066 if (!Subtarget.hasSSSE3()) {
23067 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23068 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23069 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23072 // Decompose 256-bit ops into smaller 128-bit ops.
23073 if (VT.is256BitVector() && !Subtarget.hasInt256())
23074 return Lower256IntUnary(Op, DAG);
23076 // Decompose 512-bit ops into smaller 256-bit ops.
23077 if (VT.is512BitVector() && !Subtarget.hasBWI())
23078 return Lower512IntUnary(Op, DAG);
23080 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23083 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23084 SelectionDAG &DAG) {
23085 assert(Op.getSimpleValueType().isVector() &&
23086 "We only do custom lowering for vector population count.");
23087 return LowerVectorCTPOP(Op, Subtarget, DAG);
23090 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23091 MVT VT = Op.getSimpleValueType();
23092 SDValue In = Op.getOperand(0);
23093 SDLoc DL(Op);
23095 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23096 // perform the BITREVERSE.
23097 if (!VT.isVector()) {
23098 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23099 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23100 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23101 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23102 DAG.getIntPtrConstant(0, DL));
23105 int NumElts = VT.getVectorNumElements();
23106 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23108 // Decompose 256-bit ops into smaller 128-bit ops.
23109 if (VT.is256BitVector())
23110 return Lower256IntUnary(Op, DAG);
23112 assert(VT.is128BitVector() &&
23113 "Only 128-bit vector bitreverse lowering supported.");
23115 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23116 // perform the BSWAP in the shuffle.
23117 // It's best to shuffle using the second operand as this will implicitly allow
23118 // memory folding for multiple vectors.
23119 SmallVector<SDValue, 16> MaskElts;
23120 for (int i = 0; i != NumElts; ++i) {
23121 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23122 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23123 int PermuteByte = SourceByte | (2 << 5);
23124 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23128 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23129 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23130 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23131 Res, Mask);
23132 return DAG.getBitcast(VT, Res);
23135 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23136 SelectionDAG &DAG) {
23137 if (Subtarget.hasXOP())
23138 return LowerBITREVERSE_XOP(Op, DAG);
23140 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23142 MVT VT = Op.getSimpleValueType();
23143 SDValue In = Op.getOperand(0);
23144 SDLoc DL(Op);
23146 unsigned NumElts = VT.getVectorNumElements();
23147 assert(VT.getScalarType() == MVT::i8 &&
23148 "Only byte vector BITREVERSE supported");
23150 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23151 if (VT.is256BitVector() && !Subtarget.hasInt256())
23152 return Lower256IntUnary(Op, DAG);
23154 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23155 // two nibbles and a PSHUFB lookup to find the bitreverse of each
23156 // 0-15 value (moved to the other nibble).
23157 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23158 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23159 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23161 const int LoLUT[16] = {
23162 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23163 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23164 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23165 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23166 const int HiLUT[16] = {
23167 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23168 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23169 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23170 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
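// Worked sketch: for the byte 0x1E = 0b00011110, the low nibble 0xE maps to
// LoLUT[0xE] = 0x70 and the high nibble 0x1 maps to HiLUT[0x1] = 0x08;
// OR-ing the two gives 0x78 = 0b01111000, the bit-reversed byte.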
23172 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23173 for (unsigned i = 0; i < NumElts; ++i) {
23174 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23175 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23178 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23179 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23180 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23181 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23182 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23185 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23186 unsigned NewOpc = 0;
23187 switch (N->getOpcode()) {
23188 case ISD::ATOMIC_LOAD_ADD:
23189 NewOpc = X86ISD::LADD;
23190 break;
23191 case ISD::ATOMIC_LOAD_SUB:
23192 NewOpc = X86ISD::LSUB;
23193 break;
23194 case ISD::ATOMIC_LOAD_OR:
23195 NewOpc = X86ISD::LOR;
23196 break;
23197 case ISD::ATOMIC_LOAD_XOR:
23198 NewOpc = X86ISD::LXOR;
23199 break;
23200 case ISD::ATOMIC_LOAD_AND:
23201 NewOpc = X86ISD::LAND;
23202 break;
23203 default:
23204 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23207 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23208 return DAG.getMemIntrinsicNode(
23209 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23210 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23211 /*MemVT=*/N->getSimpleValueType(0), MMO);
23214 /// Lower atomic_load_ops into LOCK-prefixed operations.
23215 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23216 const X86Subtarget &Subtarget) {
23217 SDValue Chain = N->getOperand(0);
23218 SDValue LHS = N->getOperand(1);
23219 SDValue RHS = N->getOperand(2);
23220 unsigned Opc = N->getOpcode();
23221 MVT VT = N->getSimpleValueType(0);
23222 SDLoc DL(N);
23224 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23225 // can only be lowered when the result is unused. They should have already
23226 // been transformed into a cmpxchg loop in AtomicExpand.
23227 if (N->hasAnyUseOfValue(0)) {
23228 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23229 // select LXADD if LOCK_SUB can't be selected.
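// E.g. (illustrative): "atomicrmw sub i32* %p, i32 5" becomes
// "atomicrmw add i32* %p, i32 -5", which matches LOCK XADD when the old
// value is needed.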
23230 if (Opc == ISD::ATOMIC_LOAD_SUB) {
23231 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23232 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23233 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23234 RHS, AN->getMemOperand());
23236 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23237 "Used AtomicRMW ops other than Add should have been expanded!");
23241 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23242 // RAUW the chain, but don't worry about the result, as it's unused.
23243 assert(!N->hasAnyUseOfValue(0));
23244 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23245 return SDValue();
23248 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23249 SDNode *Node = Op.getNode();
23250 SDLoc dl(Node);
23251 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23253 // Convert seq_cst store -> xchg
23254 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23255 // FIXME: On 32-bit, store -> fist or movq would be more efficient
23256 // (The only way to get a 16-byte store is cmpxchg16b)
23257 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
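// Sketch of the net effect: a seq_cst "store i32 %v, i32* %p" becomes an
// ATOMIC_SWAP whose integer result is dead and only the chain is kept, and
// an i64 store on a 32-bit target takes the same path so that it can later
// be expanded via cmpxchg8b.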
23258 if (cast<AtomicSDNode>(Node)->getOrdering() ==
23259 AtomicOrdering::SequentiallyConsistent ||
23260 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23261 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23262 cast<AtomicSDNode>(Node)->getMemoryVT(),
23263 Node->getOperand(0),
23264 Node->getOperand(1), Node->getOperand(2),
23265 cast<AtomicSDNode>(Node)->getMemOperand());
23266 return Swap.getValue(1);
23268 // Other atomic stores have a simple pattern.
23269 return Op;
23272 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
23273 MVT VT = Op.getNode()->getSimpleValueType(0);
23275 // Let legalize expand this if it isn't a legal type yet.
23276 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23277 return SDValue();
23279 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23281 unsigned Opc;
23282 bool ExtraOp = false;
23283 switch (Op.getOpcode()) {
23284 default: llvm_unreachable("Invalid code");
23285 case ISD::ADDC: Opc = X86ISD::ADD; break;
23286 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
23287 case ISD::SUBC: Opc = X86ISD::SUB; break;
23288 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
23291 if (!ExtraOp)
23292 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
23293 Op.getOperand(1));
23294 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
23295 Op.getOperand(1), Op.getOperand(2));
23298 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23299 SDNode *N = Op.getNode();
23300 MVT VT = N->getSimpleValueType(0);
23302 // Let legalize expand this if it isn't a legal type yet.
23303 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23304 return SDValue();
23306 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23307 SDLoc DL(Op);
23309 // Set the carry flag.
23310 SDValue Carry = Op.getOperand(2);
23311 EVT CarryVT = Carry.getValueType();
23312 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
23313 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23314 Carry, DAG.getConstant(NegOne, DL, CarryVT));
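// Why this works (sketch): adding all-ones (-1) to the incoming carry
// produces 0 with CF = 1 when the carry is 1, and all-ones with CF = 0 when
// the carry is 0, so EFLAGS.CF now holds exactly the carry bit consumed by
// the ADC/SBB below.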
23316 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
23317 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
23318 Op.getOperand(1), Carry.getValue(1));
23320 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
23321 if (N->getValueType(1) == MVT::i1)
23322 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23324 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23327 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23328 SelectionDAG &DAG) {
23329 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23331 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23332 // which returns the values as { float, float } (in XMM0) or
23333 // { double, double } (which is returned in XMM0, XMM1).
23334 SDLoc dl(Op);
23335 SDValue Arg = Op.getOperand(0);
23336 EVT ArgVT = Arg.getValueType();
23337 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23339 TargetLowering::ArgListTy Args;
23340 TargetLowering::ArgListEntry Entry;
23342 Entry.Node = Arg;
23343 Entry.Ty = ArgTy;
23344 Entry.IsSExt = false;
23345 Entry.IsZExt = false;
23346 Args.push_back(Entry);
23348 bool isF64 = ArgVT == MVT::f64;
23349 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23350 // the small struct {f32, f32} is returned in (eax, edx). For f64,
23351 // the results are returned via SRet in memory.
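// So on x86_64 (sketch): __sincosf_stret packs {sin, cos} into xmm0 as the
// first two lanes of a <4 x float>, while __sincos_stret returns the sine
// in xmm0 and the cosine in xmm1 as a { double, double } struct, which is
// why RetTy below is a vector type for f32 and a struct type for f64.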
23352 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
23353 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23354 SDValue Callee =
23355 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23357 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
23358 : (Type *)VectorType::get(ArgTy, 4);
23360 TargetLowering::CallLoweringInfo CLI(DAG);
23361 CLI.setDebugLoc(dl)
23362 .setChain(DAG.getEntryNode())
23363 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23365 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23367 if (isF64)
23368 // Returned in xmm0 and xmm1.
23369 return CallResult.first;
23371 // Returned in bits 0:31 and 32:63 of xmm0.
23372 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23373 CallResult.first, DAG.getIntPtrConstant(0, dl));
23374 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23375 CallResult.first, DAG.getIntPtrConstant(1, dl));
23376 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23377 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23380 /// Widen a vector input to a vector of NVT. The
23381 /// input vector must have the same element type as NVT.
23382 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23383 bool FillWithZeroes = false) {
23384 // Check if InOp already has the right width.
23385 MVT InVT = InOp.getSimpleValueType();
23386 if (InVT == NVT)
23387 return InOp;
23389 if (InOp.isUndef())
23390 return DAG.getUNDEF(NVT);
23392 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23393 "input and widen element type must match");
23395 unsigned InNumElts = InVT.getVectorNumElements();
23396 unsigned WidenNumElts = NVT.getVectorNumElements();
23397 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23398 "Unexpected request for vector widening");
23400 SDLoc dl(InOp);
23401 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23402 InOp.getNumOperands() == 2) {
23403 SDValue N1 = InOp.getOperand(1);
23404 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23405 N1.isUndef()) {
23406 InOp = InOp.getOperand(0);
23407 InVT = InOp.getSimpleValueType();
23408 InNumElts = InVT.getVectorNumElements();
23411 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23412 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23413 SmallVector<SDValue, 16> Ops;
23414 for (unsigned i = 0; i < InNumElts; ++i)
23415 Ops.push_back(InOp.getOperand(i));
23417 EVT EltVT = InOp.getOperand(0).getValueType();
23419 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23420 DAG.getUNDEF(EltVT);
23421 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23422 Ops.push_back(FillVal);
23423 return DAG.getBuildVector(NVT, dl, Ops);
23425 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23426 DAG.getUNDEF(NVT);
23427 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23428 InOp, DAG.getIntPtrConstant(0, dl));
23431 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23432 SelectionDAG &DAG) {
23433 assert(Subtarget.hasAVX512() &&
23434 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23436 // X86 scatter kills the mask register, so its type should be added to
23437 // the list of return values.
23438 // If the "scatter" has 2 return values, it is already handled.
23439 if (Op.getNode()->getNumValues() == 2)
23440 return Op;
23442 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23443 SDValue Src = N->getValue();
23444 MVT VT = Src.getSimpleValueType();
23445 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23446 SDLoc dl(Op);
23448 SDValue NewScatter;
23449 SDValue Index = N->getIndex();
23450 SDValue Mask = N->getMask();
23451 SDValue Chain = N->getChain();
23452 SDValue BasePtr = N->getBasePtr();
23453 MVT MemVT = N->getMemoryVT().getSimpleVT();
23454 MVT IndexVT = Index.getSimpleValueType();
23455 MVT MaskVT = Mask.getSimpleValueType();
23457 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23458 // The v2i32 value was promoted to v2i64.
23459 // Now we "redo" the type legalizer's work and widen the original
23460 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
23461 // in the bitcast.
23462 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23463 "Unexpected memory type");
23464 int ShuffleMask[] = {0, 2, -1, -1};
23465 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23466 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
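// E.g. (illustrative): a v2i32 value <x, y> promoted to v2i64 carries its
// payload in the even lanes of the v4i32 bitcast, so the <0, 2, -1, -1>
// shuffle above picks out <x, y, undef, undef> as the widened v4i32 source.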
23467 // Now we have 4 elements instead of 2.
23468 // Expand the index.
23469 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23470 Index = ExtendToType(Index, NewIndexVT, DAG);
23472 // Expand the mask with zeroes
23473 // Mask may be <2 x i64> or <2 x i1> at this moment
23474 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23475 "Unexpected mask type");
23476 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23477 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23481 unsigned NumElts = VT.getVectorNumElements();
23482 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23483 !Index.getSimpleValueType().is512BitVector()) {
23484 // AVX512F supports only 512-bit vectors. Either the data or the index
23485 // must be 512 bits wide. If both the index and data are 256-bit but the
23486 // vector contains 8 elements, we just sign-extend the index.
23487 if (IndexVT == MVT::v8i32)
23488 // Just extend index
23489 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23490 else {
23491 // The minimal number of elts in scatter is 8
23492 NumElts = 8;
23494 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23495 // Use original index here, do not modify the index twice
23496 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23497 if (IndexVT.getScalarType() == MVT::i32)
23498 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23501 // At this point we have a promoted mask operand.
23502 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23503 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23504 // Use the original mask here, do not modify the mask twice
23505 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23507 // The value that should be stored
23508 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23509 Src = ExtendToType(Src, NewVT, DAG);
23512 // If the mask is "wide" at this point - truncate it to i1 vector
23513 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23514 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23516 // The mask is killed by scatter, add it to the values
23517 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23518 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23519 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23520 N->getMemOperand());
23521 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23522 return SDValue(NewScatter.getNode(), 1);
23525 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23526 SelectionDAG &DAG) {
23528 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23529 MVT VT = Op.getSimpleValueType();
23530 MVT ScalarVT = VT.getScalarType();
23531 SDValue Mask = N->getMask();
23532 SDLoc dl(Op);
23534 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23535 "Expanding masked load is supported on AVX-512 target only!");
23537 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23538 "Expanding masked load is supported for 32 and 64-bit types only!");
23540 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23541 // VLX. Expanding loads of these types are handled here.
23542 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23543 return Op;
23545 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23546 "Cannot lower masked load op.");
23548 assert((ScalarVT.getSizeInBits() >= 32 ||
23549 (Subtarget.hasBWI() &&
23550 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23551 "Unsupported masked load op.");
23553 // This operation is legal for targets with VLX, but without
23554 // VLX the vector should be widened to 512 bits.
23555 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23556 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23557 SDValue Src0 = N->getSrc0();
23558 Src0 = ExtendToType(Src0, WideDataVT, DAG);
23560 // Mask element has to be i1.
23561 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23562 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23563 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23565 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23567 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23568 if (MaskEltTy != MVT::i1)
23569 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23570 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23571 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23572 N->getBasePtr(), Mask, Src0,
23573 N->getMemoryVT(), N->getMemOperand(),
23574 N->getExtensionType(),
23575 N->isExpandingLoad());
23577 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23578 NewLoad.getValue(0),
23579 DAG.getIntPtrConstant(0, dl));
23580 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
23581 return DAG.getMergeValues(RetOps, dl);
23584 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23585 SelectionDAG &DAG) {
23586 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23587 SDValue DataToStore = N->getValue();
23588 MVT VT = DataToStore.getSimpleValueType();
23589 MVT ScalarVT = VT.getScalarType();
23590 SDValue Mask = N->getMask();
23591 SDLoc dl(Op);
23593 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23594 "Compressing masked store is supported on AVX-512 target only!");
23596 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23597 "Compressing masked store is supported for 32 and 64-bit types only!");
23599 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
23600 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23601 return Op;
23603 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23604 "Cannot lower masked store op.");
23606 assert((ScalarVT.getSizeInBits() >= 32 ||
23607 (Subtarget.hasBWI() &&
23608 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23609 "Unsupported masked store op.");
23611 // This operation is legal for targets with VLX, but without
23612 // VLX the vector should be widened to 512 bits.
23613 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23614 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23616 // Mask element has to be i1.
23617 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23618 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23619 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23621 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23623 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23624 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23625 if (MaskEltTy != MVT::i1)
23626 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23627 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23628 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23629 Mask, N->getMemoryVT(), N->getMemOperand(),
23630 N->isTruncatingStore(), N->isCompressingStore());
23633 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23634 SelectionDAG &DAG) {
23635 assert(Subtarget.hasAVX512() &&
23636 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23638 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23639 SDLoc dl(Op);
23640 MVT VT = Op.getSimpleValueType();
23641 SDValue Index = N->getIndex();
23642 SDValue Mask = N->getMask();
23643 SDValue Src0 = N->getValue();
23644 MVT IndexVT = Index.getSimpleValueType();
23645 MVT MaskVT = Mask.getSimpleValueType();
23647 unsigned NumElts = VT.getVectorNumElements();
23648 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23650 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23651 !Index.getSimpleValueType().is512BitVector()) {
23652 // AVX512F supports only 512-bit vectors. Either the data or the index
23653 // must be 512 bits wide. If both the index and data are 256-bit but the
23654 // vector contains 8 elements, we just sign-extend the index.
23655 if (NumElts == 8) {
23656 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23657 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23658 N->getOperand(3), Index };
23659 DAG.UpdateNodeOperands(N, Ops);
23660 return Op;
23661 }
23663 // Minimal number of elements in Gather
23664 NumElts = 8;
23666 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23667 Index = ExtendToType(Index, NewIndexVT, DAG);
23668 if (IndexVT.getScalarType() == MVT::i32)
23669 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23672 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23673 // At this point we have a promoted mask operand.
23674 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23675 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23676 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23677 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23679 // The pass-through value
23680 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23681 Src0 = ExtendToType(Src0, NewVT, DAG);
23683 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23684 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23685 N->getMemoryVT(), dl, Ops,
23686 N->getMemOperand());
23687 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23688 NewGather.getValue(0),
23689 DAG.getIntPtrConstant(0, dl));
23690 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
23691 return DAG.getMergeValues(RetOps, dl);
23696 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23697 SelectionDAG &DAG) const {
23698 // TODO: Eventually, the lowering of these nodes should be informed by or
23699 // deferred to the GC strategy for the function in which they appear. For
23700 // now, however, they must be lowered to something. Since they are logically
23701 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23702 // require special handling for these nodes), lower them as literal NOOPs for
23703 // the time being.
23704 SmallVector<SDValue, 2> Ops;
23706 Ops.push_back(Op.getOperand(0));
23707 if (Op->getGluedNode())
23708 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23711 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23712 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23714 return NOOP;
23717 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23718 SelectionDAG &DAG) const {
23719 // TODO: Eventually, the lowering of these nodes should be informed by or
23720 // deferred to the GC strategy for the function in which they appear. For
23721 // now, however, they must be lowered to something. Since they are logically
23722 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23723 // require special handling for these nodes), lower them as literal NOOPs for
23724 // the time being.
23725 SmallVector<SDValue, 2> Ops;
23727 Ops.push_back(Op.getOperand(0));
23728 if (Op->getGluedNode())
23729 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23732 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23733 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23735 return NOOP;
23738 /// Provide custom lowering hooks for some operations.
23739 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
23740 switch (Op.getOpcode()) {
23741 default: llvm_unreachable("Should not custom lower this!");
23742 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
23743 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
23744 return LowerCMP_SWAP(Op, Subtarget, DAG);
23745 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
23746 case ISD::ATOMIC_LOAD_ADD:
23747 case ISD::ATOMIC_LOAD_SUB:
23748 case ISD::ATOMIC_LOAD_OR:
23749 case ISD::ATOMIC_LOAD_XOR:
23750 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23751 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23752 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23753 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23754 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23755 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23756 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23757 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23758 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23759 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23760 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23761 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
23762 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23763 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23764 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23765 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23766 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23767 case ISD::SHL_PARTS:
23768 case ISD::SRA_PARTS:
23769 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23770 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23771 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23772 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23773 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23774 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23775 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23776 case ISD::ZERO_EXTEND_VECTOR_INREG:
23777 case ISD::SIGN_EXTEND_VECTOR_INREG:
23778 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23779 case ISD::FP_TO_SINT:
23780 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
23781 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23782 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23783 case ISD::FABS:
23784 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23785 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23786 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23787 case ISD::SETCC: return LowerSETCC(Op, DAG);
23788 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
23789 case ISD::SELECT: return LowerSELECT(Op, DAG);
23790 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23791 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23792 case ISD::VASTART: return LowerVASTART(Op, DAG);
23793 case ISD::VAARG: return LowerVAARG(Op, DAG);
23794 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23795 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23796 case ISD::INTRINSIC_VOID:
23797 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23798 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23799 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23800 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23801 case ISD::FRAME_TO_ARGS_OFFSET:
23802 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23803 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23804 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23805 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23806 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23807 case ISD::EH_SJLJ_SETUP_DISPATCH:
23808 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23809 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23810 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23811 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23812 case ISD::CTLZ:
23813 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23814 case ISD::CTTZ:
23815 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23816 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23817 case ISD::MULHS:
23818 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23819 case ISD::UMUL_LOHI:
23820 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23821 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23822 case ISD::SRA:
23823 case ISD::SRL:
23824 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23825 case ISD::SADDO:
23826 case ISD::UADDO:
23827 case ISD::SSUBO:
23828 case ISD::USUBO:
23829 case ISD::SMULO:
23830 case ISD::UMULO: return LowerXALUO(Op, DAG);
23831 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23832 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23833 case ISD::ADDC:
23834 case ISD::ADDE:
23835 case ISD::SUBC:
23836 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
23837 case ISD::ADDCARRY:
23838 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
23839 case ISD::ADD:
23840 case ISD::SUB: return LowerADD_SUB(Op, DAG);
23841 case ISD::SMAX:
23842 case ISD::SMIN:
23843 case ISD::UMAX:
23844 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23845 case ISD::ABS: return LowerABS(Op, DAG);
23846 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23847 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23848 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23849 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23850 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23851 case ISD::GC_TRANSITION_START:
23852 return LowerGC_TRANSITION_START(Op, DAG);
23853 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23854 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23858 /// Places new result values for the node in Results (their number
23859 /// and types must exactly match those of the original return values of
23860 /// the node), or leaves Results empty, which indicates that the node is not
23861 /// to be custom lowered after all.
23862 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23863 SmallVectorImpl<SDValue> &Results,
23864 SelectionDAG &DAG) const {
23865 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
23867 if (!Res.getNode())
23868 return;
23870 assert((N->getNumValues() <= Res->getNumValues()) &&
23871 "Lowering returned the wrong number of results!");
23873 // Places new result values based on N's result number.
23874 // In some cases (LowerSINT_TO_FP for example) Res has more result values
23875 // than the original node; the chain should be dropped (the last value).
23876 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23877 Results.push_back(Res.getValue(I));
23880 /// Replace a node with an illegal result type with a new node built out of
23881 /// custom code.
23882 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23883 SmallVectorImpl<SDValue>&Results,
23884 SelectionDAG &DAG) const {
23885 SDLoc dl(N);
23886 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23887 switch (N->getOpcode()) {
23888 default:
23889 llvm_unreachable("Do not know how to custom type legalize this operation!");
23890 case X86ISD::AVG: {
23891 // Legalize types for X86ISD::AVG by expanding vectors.
23892 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23894 auto InVT = N->getValueType(0);
23895 auto InVTSize = InVT.getSizeInBits();
23896 const unsigned RegSize =
23897 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23898 assert((Subtarget.hasBWI() || RegSize < 512) &&
23899 "512-bit vector requires AVX512BW");
23900 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23901 "256-bit vector requires AVX2");
23903 auto ElemVT = InVT.getVectorElementType();
23904 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23905 RegSize / ElemVT.getSizeInBits());
23906 assert(RegSize % InVT.getSizeInBits() == 0);
23907 unsigned NumConcat = RegSize / InVT.getSizeInBits();
23909 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23910 Ops[0] = N->getOperand(0);
23911 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23912 Ops[0] = N->getOperand(1);
23913 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23915 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23916 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23917 DAG.getIntPtrConstant(0, dl)));
23918 return;
23920 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
23921 case X86ISD::FMINC:
23922 case X86ISD::FMIN:
23923 case X86ISD::FMAXC:
23924 case X86ISD::FMAX: {
23925 EVT VT = N->getValueType(0);
23926 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
23927 SDValue UNDEF = DAG.getUNDEF(VT);
23928 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23929 N->getOperand(0), UNDEF);
23930 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23931 N->getOperand(1), UNDEF);
23932 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
23933 return;
23934 }
23939 case ISD::SDIVREM:
23940 case ISD::UDIVREM: {
23941 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23942 Results.push_back(V);
23943 return;
23945 case ISD::FP_TO_SINT:
23946 case ISD::FP_TO_UINT: {
23947 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
23949 if (N->getValueType(0) == MVT::v2i32) {
23950 assert((IsSigned || Subtarget.hasAVX512()) &&
23951 "Can only handle signed conversion without AVX512");
23952 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23953 SDValue Src = N->getOperand(0);
23954 if (Src.getValueType() == MVT::v2f64) {
23955 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23956 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23957 : X86ISD::CVTTP2UI,
23958 dl, MVT::v4i32, Src);
23959 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23960 Results.push_back(Res);
23961 return;
23963 if (Src.getValueType() == MVT::v2f32) {
23964 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23965 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23966 DAG.getUNDEF(MVT::v2f32));
23967 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23968 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23969 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23970 Results.push_back(Res);
23971 return;
23974 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
23975 // so early out here.
23976 return;
23979 std::pair<SDValue,SDValue> Vals =
23980 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
23981 SDValue FIST = Vals.first, StackSlot = Vals.second;
23982 if (FIST.getNode()) {
23983 EVT VT = N->getValueType(0);
23984 // Return a load from the stack slot.
23985 if (StackSlot.getNode())
23986 Results.push_back(
23987 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
23988 else
23989 Results.push_back(FIST);
23993 case ISD::SINT_TO_FP: {
23994 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
23995 SDValue Src = N->getOperand(0);
23996 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
23997 return;
23998 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
23999 return;
24001 case ISD::UINT_TO_FP: {
24002 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24003 EVT VT = N->getValueType(0);
24004 if (VT != MVT::v2f32)
24005 return;
24006 SDValue Src = N->getOperand(0);
24007 EVT SrcVT = Src.getValueType();
24008 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24009 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24010 return;
24011 }
24012 if (SrcVT != MVT::v2i32)
24013 return;
24014 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24015 SDValue VBias =
24016 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
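// Sketch of the bias trick: 0x4330000000000000 is the IEEE-754 encoding of
// 2^52, and OR-ing a zero-extended 32-bit integer n into its low mantissa
// bits produces the double 2^52 + n exactly; the FSUB below then recovers
// n converted to double.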
24017 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24018 DAG.getBitcast(MVT::v2i64, VBias));
24019 Or = DAG.getBitcast(MVT::v2f64, Or);
24020 // TODO: Are there any fast-math-flags to propagate here?
24021 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24022 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24023 return;
24025 case ISD::FP_ROUND: {
24026 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24027 return;
24028 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24029 Results.push_back(V);
24030 return;
24032 case ISD::FP_EXTEND: {
24033 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24034 // No other ValueType for FP_EXTEND should reach this point.
24035 assert(N->getValueType(0) == MVT::v2f32 &&
24036 "Do not know how to legalize this Node");
24039 case ISD::INTRINSIC_W_CHAIN: {
24040 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24042 default : llvm_unreachable("Do not know how to custom type "
24043 "legalize this intrinsic operation!");
24044 case Intrinsic::x86_rdtsc:
24045 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24046 Results);
24047 case Intrinsic::x86_rdtscp:
24048 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24049 Results);
24050 case Intrinsic::x86_rdpmc:
24051 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24053 case Intrinsic::x86_xgetbv:
24054 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24057 case ISD::INTRINSIC_WO_CHAIN: {
24058 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24059 Results.push_back(V);
24062 case ISD::READCYCLECOUNTER: {
24063 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24064 Results);
24066 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24067 EVT T = N->getValueType(0);
24068 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24069 bool Regs64bit = T == MVT::i128;
24070 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24071 SDValue cpInL, cpInH;
24072 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24073 DAG.getConstant(0, dl, HalfT));
24074 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24075 DAG.getConstant(1, dl, HalfT));
24076 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24077 Regs64bit ? X86::RAX : X86::EAX,
24078 cpInL, SDValue());
24079 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24080 Regs64bit ? X86::RDX : X86::EDX,
24081 cpInH, cpInL.getValue(1));
24082 SDValue swapInL, swapInH;
24083 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24084 DAG.getConstant(0, dl, HalfT));
24085 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24086 DAG.getConstant(1, dl, HalfT));
24087 swapInH =
24088 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24089 swapInH, cpInH.getValue(1));
24090 // If the current function needs the base pointer, RBX,
24091 // we shouldn't use cmpxchg directly.
24092 // Indeed, the lowering of that instruction will clobber that register, and
24093 // since RBX will be a reserved register, the register allocator will not
24094 // make sure its value is properly saved and restored around this
24095 // live-range.
24096 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24097 SDValue Result;
24098 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24099 unsigned BasePtr = TRI->getBaseRegister();
24100 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24101 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24102 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24103 // ISel prefers the LCMPXCHG64 variant.
24104 // If that assert breaks, that means it is not the case anymore,
24105 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24106 // not just EBX. This is a matter of accepting i64 input for that
24107 // pseudo, and restoring into a register of the right width
24108 // in the expand pseudo. Everything else should just work.
24109 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24110 "Saving only half of the RBX");
24111 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24112 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24113 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24114 Regs64bit ? X86::RBX : X86::EBX,
24115 HalfT, swapInH.getValue(1));
24116 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24118 /*Glue*/ RBXSave.getValue(2)};
24119 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24120 } else {
24121 unsigned Opcode =
24122 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24123 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24124 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24125 swapInH.getValue(1));
24126 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24127 swapInL.getValue(1)};
24128 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24130 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24131 Regs64bit ? X86::RAX : X86::EAX,
24132 HalfT, Result.getValue(1));
24133 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24134 Regs64bit ? X86::RDX : X86::EDX,
24135 HalfT, cpOutL.getValue(2));
24136 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24138 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24139 MVT::i32, cpOutH.getValue(2));
24140 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24141 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24143 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24144 Results.push_back(Success);
24145 Results.push_back(EFLAGS.getValue(1));
24148 case ISD::ATOMIC_SWAP:
24149 case ISD::ATOMIC_LOAD_ADD:
24150 case ISD::ATOMIC_LOAD_SUB:
24151 case ISD::ATOMIC_LOAD_AND:
24152 case ISD::ATOMIC_LOAD_OR:
24153 case ISD::ATOMIC_LOAD_XOR:
24154 case ISD::ATOMIC_LOAD_NAND:
24155 case ISD::ATOMIC_LOAD_MIN:
24156 case ISD::ATOMIC_LOAD_MAX:
24157 case ISD::ATOMIC_LOAD_UMIN:
24158 case ISD::ATOMIC_LOAD_UMAX:
24159 case ISD::ATOMIC_LOAD: {
24160 // Delegate to generic TypeLegalization. Situations we can really handle
24161 // should have already been dealt with by AtomicExpandPass.cpp.
24162 return;
24164 case ISD::BITCAST: {
24165 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24166 EVT DstVT = N->getValueType(0);
24167 EVT SrcVT = N->getOperand(0)->getValueType(0);
24169 if (SrcVT != MVT::f64 ||
24170 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24171 return;
24173 unsigned NumElts = DstVT.getVectorNumElements();
24174 EVT SVT = DstVT.getVectorElementType();
24175 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24176 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24177 MVT::v2f64, N->getOperand(0));
24178 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24180 if (ExperimentalVectorWideningLegalization) {
24181 // If we are legalizing vectors by widening, we already have the desired
24182 // legal vector type, just return it.
24183 Results.push_back(ToVecInt);
24184 return;
24185 }
24187 SmallVector<SDValue, 8> Elts;
24188 for (unsigned i = 0, e = NumElts; i != e; ++i)
24189 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24190 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24192 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24197 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24198 switch ((X86ISD::NodeType)Opcode) {
24199 case X86ISD::FIRST_NUMBER: break;
24200 case X86ISD::BSF: return "X86ISD::BSF";
24201 case X86ISD::BSR: return "X86ISD::BSR";
24202 case X86ISD::SHLD: return "X86ISD::SHLD";
24203 case X86ISD::SHRD: return "X86ISD::SHRD";
24204 case X86ISD::FAND: return "X86ISD::FAND";
24205 case X86ISD::FANDN: return "X86ISD::FANDN";
24206 case X86ISD::FOR: return "X86ISD::FOR";
24207 case X86ISD::FXOR: return "X86ISD::FXOR";
24208 case X86ISD::FILD: return "X86ISD::FILD";
24209 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24210 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24211 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24212 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24213 case X86ISD::FLD: return "X86ISD::FLD";
24214 case X86ISD::FST: return "X86ISD::FST";
24215 case X86ISD::CALL: return "X86ISD::CALL";
24216 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24217 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24218 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24219 case X86ISD::BT: return "X86ISD::BT";
24220 case X86ISD::CMP: return "X86ISD::CMP";
24221 case X86ISD::COMI: return "X86ISD::COMI";
24222 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24223 case X86ISD::CMPM: return "X86ISD::CMPM";
24224 case X86ISD::CMPMU: return "X86ISD::CMPMU";
24225 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
24226 case X86ISD::SETCC: return "X86ISD::SETCC";
24227 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
24228 case X86ISD::FSETCC: return "X86ISD::FSETCC";
24229 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
24230 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
24231 case X86ISD::CMOV: return "X86ISD::CMOV";
24232 case X86ISD::BRCOND: return "X86ISD::BRCOND";
24233 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
24234 case X86ISD::IRET: return "X86ISD::IRET";
24235 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
24236 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
24237 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
24238 case X86ISD::Wrapper: return "X86ISD::Wrapper";
24239 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
24240 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
24241 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
24242 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
24243 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
24244 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
24245 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
24246 case X86ISD::PINSRB: return "X86ISD::PINSRB";
24247 case X86ISD::PINSRW: return "X86ISD::PINSRW";
24248 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
24249 case X86ISD::ANDNP: return "X86ISD::ANDNP";
24250 case X86ISD::BLENDI: return "X86ISD::BLENDI";
24251 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
24252 case X86ISD::ADDUS: return "X86ISD::ADDUS";
24253 case X86ISD::SUBUS: return "X86ISD::SUBUS";
24254 case X86ISD::HADD: return "X86ISD::HADD";
24255 case X86ISD::HSUB: return "X86ISD::HSUB";
24256 case X86ISD::FHADD: return "X86ISD::FHADD";
24257 case X86ISD::FHSUB: return "X86ISD::FHSUB";
24258 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
24259 case X86ISD::FMAX: return "X86ISD::FMAX";
24260 case X86ISD::FMAXS: return "X86ISD::FMAXS";
24261 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
24262 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
24263 case X86ISD::FMIN: return "X86ISD::FMIN";
24264 case X86ISD::FMINS: return "X86ISD::FMINS";
24265 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
24266 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
24267 case X86ISD::FMAXC: return "X86ISD::FMAXC";
24268 case X86ISD::FMINC: return "X86ISD::FMINC";
24269 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24270 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
24271 case X86ISD::FRCP: return "X86ISD::FRCP";
24272 case X86ISD::FRCPS: return "X86ISD::FRCPS";
24273 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
24274 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
24275 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
24276 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
24277 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
24278 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
24279 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
24280 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24281 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24282 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
24283 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
24284 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
24285 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
24286 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
24287 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
24288 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
24289 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24290 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24291 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24292 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24293 case X86ISD::LADD: return "X86ISD::LADD";
24294 case X86ISD::LSUB: return "X86ISD::LSUB";
24295 case X86ISD::LOR: return "X86ISD::LOR";
24296 case X86ISD::LXOR: return "X86ISD::LXOR";
24297 case X86ISD::LAND: return "X86ISD::LAND";
24298 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
24299 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
24300 case X86ISD::VZEXT: return "X86ISD::VZEXT";
24301 case X86ISD::VSEXT: return "X86ISD::VSEXT";
24302 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
24303 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
24304 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
24305 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
24306 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
24307 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
24308 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
24309 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
24310 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
24311 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
24312 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
24313 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
24314 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
24315 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
24316 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
24317 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
24318 case X86ISD::VSHL: return "X86ISD::VSHL";
24319 case X86ISD::VSRL: return "X86ISD::VSRL";
24320 case X86ISD::VSRA: return "X86ISD::VSRA";
24321 case X86ISD::VSHLI: return "X86ISD::VSHLI";
24322 case X86ISD::VSRLI: return "X86ISD::VSRLI";
24323 case X86ISD::VSRAI: return "X86ISD::VSRAI";
24324 case X86ISD::VSRAV: return "X86ISD::VSRAV";
24325 case X86ISD::VROTLI: return "X86ISD::VROTLI";
24326 case X86ISD::VROTRI: return "X86ISD::VROTRI";
24327 case X86ISD::VPPERM: return "X86ISD::VPPERM";
24328 case X86ISD::CMPP: return "X86ISD::CMPP";
24329 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
24330 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
24331 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
24332 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
24333 case X86ISD::ADD: return "X86ISD::ADD";
24334 case X86ISD::SUB: return "X86ISD::SUB";
24335 case X86ISD::ADC: return "X86ISD::ADC";
24336 case X86ISD::SBB: return "X86ISD::SBB";
24337 case X86ISD::SMUL: return "X86ISD::SMUL";
24338 case X86ISD::UMUL: return "X86ISD::UMUL";
24339 case X86ISD::SMUL8: return "X86ISD::SMUL8";
24340 case X86ISD::UMUL8: return "X86ISD::UMUL8";
24341 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24342 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24343 case X86ISD::INC: return "X86ISD::INC";
24344 case X86ISD::DEC: return "X86ISD::DEC";
24345 case X86ISD::OR: return "X86ISD::OR";
24346 case X86ISD::XOR: return "X86ISD::XOR";
24347 case X86ISD::AND: return "X86ISD::AND";
24348 case X86ISD::BEXTR: return "X86ISD::BEXTR";
24349 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
24350 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
24351 case X86ISD::PTEST: return "X86ISD::PTEST";
24352 case X86ISD::TESTP: return "X86ISD::TESTP";
24353 case X86ISD::TESTM: return "X86ISD::TESTM";
24354 case X86ISD::TESTNM: return "X86ISD::TESTNM";
24355 case X86ISD::KORTEST: return "X86ISD::KORTEST";
24356 case X86ISD::KTEST: return "X86ISD::KTEST";
24357 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
24358 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
24359 case X86ISD::PACKSS: return "X86ISD::PACKSS";
24360 case X86ISD::PACKUS: return "X86ISD::PACKUS";
24361 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
24362 case X86ISD::VALIGN: return "X86ISD::VALIGN";
24363 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
24364 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
24365 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
24366 case X86ISD::SHUFP: return "X86ISD::SHUFP";
24367 case X86ISD::SHUF128: return "X86ISD::SHUF128";
24368 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
24369 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
24370 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
24371 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
24372 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
24373 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
24374 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
24375 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
24376 case X86ISD::MOVSD: return "X86ISD::MOVSD";
24377 case X86ISD::MOVSS: return "X86ISD::MOVSS";
24378 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
24379 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
24380 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
24381 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
24382 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
24383 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
24384 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
24385 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
24386 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
24387 case X86ISD::VPERMV: return "X86ISD::VPERMV";
24388 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
24389 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
24390 case X86ISD::VPERMI: return "X86ISD::VPERMI";
24391 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
24392 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
24393 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
24394 case X86ISD::VRANGE: return "X86ISD::VRANGE";
24395 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
24396 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
24397 case X86ISD::PSADBW: return "X86ISD::PSADBW";
24398 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
24399 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24400 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
24401 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
24402 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
24403 case X86ISD::MFENCE: return "X86ISD::MFENCE";
24404 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
24405 case X86ISD::SAHF: return "X86ISD::SAHF";
24406 case X86ISD::RDRAND: return "X86ISD::RDRAND";
24407 case X86ISD::RDSEED: return "X86ISD::RDSEED";
24408 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
24409 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
24410 case X86ISD::VPROT: return "X86ISD::VPROT";
24411 case X86ISD::VPROTI: return "X86ISD::VPROTI";
24412 case X86ISD::VPSHA: return "X86ISD::VPSHA";
24413 case X86ISD::VPSHL: return "X86ISD::VPSHL";
24414 case X86ISD::VPCOM: return "X86ISD::VPCOM";
24415 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
24416 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
24417 case X86ISD::FMADD: return "X86ISD::FMADD";
24418 case X86ISD::FMSUB: return "X86ISD::FMSUB";
24419 case X86ISD::FNMADD: return "X86ISD::FNMADD";
24420 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
24421 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
24422 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
24423 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
24424 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
24425 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
24426 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
24427 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
24428 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
24429 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
24430 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
24431 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
24432 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
24433 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
24434 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
24435 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
24436 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
24437 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
24438 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
24439 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
24440 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
24441 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
24442 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
24443 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
24444 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
24445 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
24446 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
24447 case X86ISD::XTEST: return "X86ISD::XTEST";
24448 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
24449 case X86ISD::EXPAND: return "X86ISD::EXPAND";
24450 case X86ISD::SELECT: return "X86ISD::SELECT";
24451 case X86ISD::SELECTS: return "X86ISD::SELECTS";
24452 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
24453 case X86ISD::RCP28: return "X86ISD::RCP28";
24454 case X86ISD::RCP28S: return "X86ISD::RCP28S";
24455 case X86ISD::EXP2: return "X86ISD::EXP2";
24456 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
24457 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
24458 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
24459 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
24460 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
24461 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
24462 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
24463 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
24464 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
24465 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
24466 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
24467 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
24468 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
24469 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
24470 case X86ISD::SCALEF: return "X86ISD::SCALEF";
24471 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
24472 case X86ISD::ADDS: return "X86ISD::ADDS";
24473 case X86ISD::SUBS: return "X86ISD::SUBS";
24474 case X86ISD::AVG: return "X86ISD::AVG";
24475 case X86ISD::MULHRS: return "X86ISD::MULHRS";
24476 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
24477 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
24478 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
24479 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
24480 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
24481 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
24482 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
24483 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
24484 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
24485 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
24486 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
24487 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
24488 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
24489 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24490 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24491 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
24492 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
24493 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
24494 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
24495 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
24496 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
24497 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
24498 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
24499 case X86ISD::LWPINS: return "X86ISD::LWPINS";
24500 }
24501 return nullptr;
24502 }
24504 /// Return true if the addressing mode represented by AM is legal for this
24505 /// target, for a load/store of the specified type.
24506 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24507 const AddrMode &AM, Type *Ty,
24508 unsigned AS) const {
24509 // X86 supports extremely general addressing modes.
24510 CodeModel::Model M = getTargetMachine().getCodeModel();
24512 // X86 allows a sign-extended 32-bit immediate field as a displacement.
24513 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24514 return false;
24516 if (AM.BaseGV) {
24517 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24519 // If a reference to this global requires an extra load, we can't fold it.
24520 if (isGlobalStubReference(GVFlags))
24521 return false;
24523 // If BaseGV requires a register for the PIC base, we cannot also have a
24524 // BaseReg specified.
24525 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24526 return false;
24528 // If lower 4G is not available, then we must use rip-relative addressing.
24529 if ((M != CodeModel::Small || isPositionIndependent()) &&
24530 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24531 return false;
24532 }
24534 switch (AM.Scale) {
24535 case 0:
24536 case 1:
24537 case 2:
24538 case 4:
24539 case 8:
24540 // These scales always work.
24541 break;
24542 case 3:
24543 case 5:
24544 case 9:
24545 // These scales are formed with basereg+scalereg. Only accept if there is
24546 // no basereg yet.
24547 if (AM.HasBaseReg)
24548 return false;
24549 break;
24550 default: // Other stuff never works.
24551 return false;
24552 }
24554 return true;
24555 }
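// Illustrative example (not in the original source): the AddrMode
// {BaseReg=%rdi, IndexReg=%rcx, Scale=8, Disp=16} maps onto a single
// instruction such as "movq 16(%rdi,%rcx,8), %rax" and is accepted, while
// Scale=3 with a base register already present would need an extra LEA.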
24557 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24558 unsigned Bits = Ty->getScalarSizeInBits();
24560 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
24561 // noticeably cheaper than fully variable ones.
24562 if (Bits == 8)
24563 return false;
24565 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
24566 // variable shifts just as cheap as scalar ones.
24567 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24568 return false;
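// For example, with AVX2 a v8i32 shift by a vector of per-element amounts
// lowers to a single VPSLLVD, so splatting a scalar amount buys nothing.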
24570 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24571 // fully general vector.
24572 return true;
24573 }
24575 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24576 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24577 return false;
24578 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24579 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
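// Truncation is free here because narrower values just read the low
// subregister, e.g. using %eax after computing a value in %rax.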
24580 return NumBits1 > NumBits2;
24581 }
24583 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24584 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24585 return false;
24587 if (!isTypeLegal(EVT::getEVT(Ty1)))
24588 return false;
24590 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24592 // Assuming the caller doesn't have a zeroext or signext return parameter,
24593 // truncation all the way down to i1 is valid.
24594 return true;
24595 }
24597 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24598 return isInt<32>(Imm);
24599 }
24601 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24602 // Can also use sub to handle negated immediates.
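// For example, an immediate of 0x7FFFFFFF folds directly into add/cmp,
// while (1LL << 31) does not and must be materialized into a register first.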
24603 return isInt<32>(Imm);
24604 }
24606 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24607 if (!VT1.isInteger() || !VT2.isInteger())
24608 return false;
24609 unsigned NumBits1 = VT1.getSizeInBits();
24610 unsigned NumBits2 = VT2.getSizeInBits();
24611 return NumBits1 > NumBits2;
24612 }
24614 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24615 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
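// For example, "movl %ecx, %eax" also clears bits 63:32 of %rax, so no
// separate zero-extension instruction is ever emitted.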
24616 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24617 }
24619 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24620 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24621 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24622 }
24624 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24625 EVT VT1 = Val.getValueType();
24626 if (isZExtFree(VT1, VT2))
24627 return true;
24629 if (Val.getOpcode() != ISD::LOAD)
24630 return false;
24632 if (!VT1.isSimple() || !VT1.isInteger() ||
24633 !VT2.isSimple() || !VT2.isInteger())
24634 return false;
24636 switch (VT1.getSimpleVT().SimpleTy) {
24637 default: break;
24638 case MVT::i8:
24639 case MVT::i16:
24640 case MVT::i32:
24641 // X86 has 8, 16, and 32-bit zero-extending loads.
24642 return true;
24643 }
24645 return false;
24646 }
24648 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24650 bool
24651 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24652 if (!Subtarget.hasAnyFMA())
24653 return false;
24655 VT = VT.getScalarType();
24657 if (!VT.isSimple())
24658 return false;
24660 switch (VT.getSimpleVT().SimpleTy) {
24661 case MVT::f32:
24662 case MVT::f64:
24663 return true;
24664 default:
24665 break;
24666 }
24668 return false;
24669 }
24671 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24672 // i16 instructions are longer (0x66 prefix) and potentially slower.
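// For example, "addw $1, %ax" needs a 0x66 operand-size prefix (and risks a
// partial-register stall) that "addl $1, %eax" avoids.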
24673 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24674 }
24676 /// Targets can use this to indicate that they only support *some*
24677 /// VECTOR_SHUFFLE operations, those with specific masks.
24678 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24679 /// are assumed to be legal.
24680 bool
24681 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24682 EVT VT) const {
24683 if (!VT.isSimple())
24684 return false;
24686 // Not for i1 vectors
24687 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24688 return false;
24690 // Very little shuffling can be done for 64-bit vectors right now.
24691 if (VT.getSimpleVT().getSizeInBits() == 64)
24692 return false;
24694 // We only care that the types being shuffled are legal. The lowering can
24695 // handle any possible shuffle mask that results.
24696 return isTypeLegal(VT.getSimpleVT());
24697 }
24699 bool
24700 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24701 EVT VT) const {
24702 // Just delegate to the generic legality; clear masks aren't special.
24703 return isShuffleMaskLegal(Mask, VT);
24704 }
24706 //===----------------------------------------------------------------------===//
24707 // X86 Scheduler Hooks
24708 //===----------------------------------------------------------------------===//
24710 /// Utility function to emit xbegin specifying the start of an RTM region.
24711 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24712 const TargetInstrInfo *TII) {
24713 DebugLoc DL = MI.getDebugLoc();
24715 const BasicBlock *BB = MBB->getBasicBlock();
24716 MachineFunction::iterator I = ++MBB->getIterator();
24718 // For the v = xbegin(), we generate
24719 //
24720 // thisMBB:
24721 //  xbegin fallMBB
24722 //
24723 // mainMBB:
24724 //  s0 = -1
24725 //
24726 // fallMBB:
24727 //  eax = # XABORT_DEF
24728 //  s1 = eax
24729 //
24730 // sinkMBB:
24731 //  v = phi(s0/mainBB, s1/fallBB)
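// This matches the C intrinsic _xbegin(): a result of -1 means the
// transaction started; anything else is the abort status left in EAX.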
24733 MachineBasicBlock *thisMBB = MBB;
24734 MachineFunction *MF = MBB->getParent();
24735 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
24736 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
24737 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
24738 MF->insert(I, mainMBB);
24739 MF->insert(I, fallMBB);
24740 MF->insert(I, sinkMBB);
24742 // Transfer the remainder of BB and its successor edges to sinkMBB.
24743 sinkMBB->splice(sinkMBB->begin(), MBB,
24744 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24745 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
24747 MachineRegisterInfo &MRI = MF->getRegInfo();
24748 unsigned DstReg = MI.getOperand(0).getReg();
24749 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
24750 unsigned mainDstReg = MRI.createVirtualRegister(RC);
24751 unsigned fallDstReg = MRI.createVirtualRegister(RC);
24755 // # fallthrough to mainMBB
24756 // # abort to fallMBB
24757 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
24758 thisMBB->addSuccessor(mainMBB);
24759 thisMBB->addSuccessor(fallMBB);
24762 // mainDstReg := -1
24763 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
24764 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
24765 mainMBB->addSuccessor(sinkMBB);
24768 // ; pseudo instruction to model hardware's definition from XABORT
24769 // EAX := XABORT_DEF
24770 // fallDstReg := EAX
24771 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
24772 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
24774 fallMBB->addSuccessor(sinkMBB);
24777 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
24778 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
24779 .addReg(mainDstReg).addMBB(mainMBB)
24780 .addReg(fallDstReg).addMBB(fallMBB);
24782 MI.eraseFromParent();
24784 return sinkMBB;
24785 }
24786 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24787 // or XMM0_V32I8 in AVX all of this code can be replaced with that
24788 // in the .td file.
24789 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24790 const TargetInstrInfo *TII) {
24791 unsigned Opc;
24792 switch (MI.getOpcode()) {
24793 default: llvm_unreachable("illegal opcode!");
24794 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24795 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24796 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24797 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24798 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24799 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24800 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24801 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24802 }
24804 DebugLoc dl = MI.getDebugLoc();
24805 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24807 unsigned NumArgs = MI.getNumOperands();
24808 for (unsigned i = 1; i < NumArgs; ++i) {
24809 MachineOperand &Op = MI.getOperand(i);
24810 if (!(Op.isReg() && Op.isImplicit()))
24811 MIB.add(Op);
24812 }
24813 if (MI.hasOneMemOperand())
24814 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24816 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24817 .addReg(X86::XMM0);
24819 MI.eraseFromParent();
24820 return BB;
24821 }
24823 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24824 // defs in an instruction pattern
24825 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24826 const TargetInstrInfo *TII) {
24827 unsigned Opc;
24828 switch (MI.getOpcode()) {
24829 default: llvm_unreachable("illegal opcode!");
24830 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24831 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24832 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24833 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24834 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24835 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24836 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24837 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24838 }
24840 DebugLoc dl = MI.getDebugLoc();
24841 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24843 unsigned NumArgs = MI.getNumOperands(); // remove the results
24844 for (unsigned i = 1; i < NumArgs; ++i) {
24845 MachineOperand &Op = MI.getOperand(i);
24846 if (!(Op.isReg() && Op.isImplicit()))
24847 MIB.add(Op);
24848 }
24849 if (MI.hasOneMemOperand())
24850 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24852 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24853 .addReg(X86::ECX);
24855 MI.eraseFromParent();
24856 return BB;
24857 }
24859 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24860 const X86Subtarget &Subtarget) {
24861 DebugLoc dl = MI.getDebugLoc();
24862 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24864 // insert input VAL into EAX
24865 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24866 .addReg(MI.getOperand(0).getReg());
24867 // insert zero to ECX
24868 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24870 // insert zero to EDX
24871 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24873 // insert WRPKRU instruction
24874 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
24876 MI.eraseFromParent(); // The pseudo is gone now.
24877 return BB;
24878 }
24880 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24881 const X86Subtarget &Subtarget) {
24882 DebugLoc dl = MI.getDebugLoc();
24883 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24885 // insert zero to ECX
24886 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24888 // insert RDPKRU instruction
24889 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24890 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24891 .addReg(X86::EAX);
24893 MI.eraseFromParent(); // The pseudo is gone now.
24894 return BB;
24895 }
24897 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24898 const X86Subtarget &Subtarget,
24899 unsigned Opc) {
24900 DebugLoc dl = MI.getDebugLoc();
24901 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24902 // Address into RAX/EAX, other two args into ECX, EDX.
24903 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24904 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24905 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24906 for (int i = 0; i < X86::AddrNumOperands; ++i)
24907 MIB.add(MI.getOperand(i));
24909 unsigned ValOps = X86::AddrNumOperands;
24910 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24911 .addReg(MI.getOperand(ValOps).getReg());
24912 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24913 .addReg(MI.getOperand(ValOps + 1).getReg());
24915 // The instruction doesn't actually take any operands though.
24916 BuildMI(*BB, MI, dl, TII->get(Opc));
24918 MI.eraseFromParent(); // The pseudo is gone now.
24919 return BB;
24920 }
24922 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
24923 const X86Subtarget &Subtarget) {
24924 DebugLoc dl = MI->getDebugLoc();
24925 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24926 // Address into RAX/EAX
24927 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24928 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24929 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24930 for (int i = 0; i < X86::AddrNumOperands; ++i)
24931 MIB.add(MI->getOperand(i));
24933 // The instruction doesn't actually take any operands though.
24934 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
24936 MI->eraseFromParent(); // The pseudo is gone now.
24937 return BB;
24938 }
24942 MachineBasicBlock *
24943 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24944 MachineBasicBlock *MBB) const {
24945 // Emit va_arg instruction on X86-64.
24947 // Operands to this pseudo-instruction:
24948 // 0 ) Output : destination address (reg)
24949 // 1-5) Input : va_list address (addr, i64mem)
24950 // 6 ) ArgSize : Size (in bytes) of vararg type
24951 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24952 // 8 ) Align : Alignment of type
24953 // 9 ) EFLAGS (implicit-def)
24955 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24956 static_assert(X86::AddrNumOperands == 5,
24957 "VAARG_64 assumes 5 address operands");
24959 unsigned DestReg = MI.getOperand(0).getReg();
24960 MachineOperand &Base = MI.getOperand(1);
24961 MachineOperand &Scale = MI.getOperand(2);
24962 MachineOperand &Index = MI.getOperand(3);
24963 MachineOperand &Disp = MI.getOperand(4);
24964 MachineOperand &Segment = MI.getOperand(5);
24965 unsigned ArgSize = MI.getOperand(6).getImm();
24966 unsigned ArgMode = MI.getOperand(7).getImm();
24967 unsigned Align = MI.getOperand(8).getImm();
24969 // Memory Reference
24970 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24971 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24972 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24974 // Machine Information
24975 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24976 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24977 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24978 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24979 DebugLoc DL = MI.getDebugLoc();
24981 // struct va_list {
24982 //   i32 gp_offset
24983 //   i32 fp_offset
24984 //   i64 overflow_area (address)
24985 //   i64 reg_save_area (address)
24986 // }
24987 // sizeof(va_list) = 24
24988 // alignment(va_list) = 8
24990 unsigned TotalNumIntRegs = 6;
24991 unsigned TotalNumXMMRegs = 8;
24992 bool UseGPOffset = (ArgMode == 1);
24993 bool UseFPOffset = (ArgMode == 2);
24994 unsigned MaxOffset = TotalNumIntRegs * 8 +
24995 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
24997 // Align ArgSize to a multiple of 8.
24998 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
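// For example, ArgSize == 12 rounds up to ArgSizeA8 == 16, keeping overflow
// area slots 8-byte aligned as the ABI requires.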
24999 bool NeedsAlign = (Align > 8);
25001 MachineBasicBlock *thisMBB = MBB;
25002 MachineBasicBlock *overflowMBB;
25003 MachineBasicBlock *offsetMBB;
25004 MachineBasicBlock *endMBB;
25006 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
25007 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
25008 unsigned OffsetReg = 0;
25010 if (!UseGPOffset && !UseFPOffset) {
25011 // If we only pull from the overflow region, we don't create a branch.
25012 // We don't need to alter control flow.
25013 OffsetDestReg = 0; // unused
25014 OverflowDestReg = DestReg;
25016 offsetMBB = nullptr;
25017 overflowMBB = thisMBB;
25018 endMBB = thisMBB;
25019 } else {
25020 // First emit code to check if gp_offset (or fp_offset) is below the bound.
25021 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
25022 // If not, pull from overflow_area. (branch to overflowMBB)
25023 //
25024 //        thisMBB
25025 //       /       \
25026 //      /         \
25027 // offsetMBB   overflowMBB
25028 //      \         /
25029 //       \       /
25030 //         endMBB
25031 //
25032 // Registers for the PHI in endMBB
25033 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
25034 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
25036 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25037 MachineFunction *MF = MBB->getParent();
25038 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25039 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25040 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25042 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25044 // Insert the new basic blocks
25045 MF->insert(MBBIter, offsetMBB);
25046 MF->insert(MBBIter, overflowMBB);
25047 MF->insert(MBBIter, endMBB);
25049 // Transfer the remainder of MBB and its successor edges to endMBB.
25050 endMBB->splice(endMBB->begin(), thisMBB,
25051 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25052 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25054 // Make offsetMBB and overflowMBB successors of thisMBB
25055 thisMBB->addSuccessor(offsetMBB);
25056 thisMBB->addSuccessor(overflowMBB);
25058 // endMBB is a successor of both offsetMBB and overflowMBB
25059 offsetMBB->addSuccessor(endMBB);
25060 overflowMBB->addSuccessor(endMBB);
25062 // Load the offset value into a register
25063 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25064 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25065 .add(Base)
25066 .add(Scale)
25067 .add(Index)
25068 .addDisp(Disp, UseFPOffset ? 4 : 0)
25069 .add(Segment)
25070 .setMemRefs(MMOBegin, MMOEnd);
25072 // Check if there is enough room left to pull this argument.
25073 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25074 .addReg(OffsetReg)
25075 .addImm(MaxOffset + 8 - ArgSizeA8);
25077 // Branch to "overflowMBB" if offset >= max
25078 // Fall through to "offsetMBB" otherwise
25079 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25080 .addMBB(overflowMBB);
25081 }
25083 // In offsetMBB, emit code to use the reg_save_area.
25084 if (offsetMBB) {
25085 assert(OffsetReg != 0);
25087 // Read the reg_save_area address.
25088 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25089 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25090 .add(Base)
25091 .add(Scale)
25092 .add(Index)
25093 .addDisp(Disp, 16)
25094 .add(Segment)
25095 .setMemRefs(MMOBegin, MMOEnd);
25097 // Zero-extend the offset
25098 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25099 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25100 .addImm(0)
25101 .addReg(OffsetReg)
25102 .addImm(X86::sub_32bit);
25104 // Add the offset to the reg_save_area to get the final address.
25105 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25106 .addReg(OffsetReg64)
25107 .addReg(RegSaveReg);
25109 // Compute the offset for the next argument
25110 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25111 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25112 .addReg(OffsetReg)
25113 .addImm(UseFPOffset ? 16 : 8);
25115 // Store it back into the va_list.
25116 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25117 .add(Base)
25118 .add(Scale)
25119 .add(Index)
25120 .addDisp(Disp, UseFPOffset ? 4 : 0)
25121 .add(Segment)
25122 .addReg(NextOffsetReg)
25123 .setMemRefs(MMOBegin, MMOEnd);
25126 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25127 .addMBB(endMBB);
25128 }
25131 // Emit code to use overflow area
25134 // Load the overflow_area address into a register.
25135 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25136 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25137 .add(Base)
25138 .add(Scale)
25139 .add(Index)
25140 .addDisp(Disp, 8)
25141 .add(Segment)
25142 .setMemRefs(MMOBegin, MMOEnd);
25144 // If we need to align it, do so. Otherwise, just copy the address
25145 // to OverflowDestReg.
25146 if (NeedsAlign) {
25147 // Align the overflow address
25148 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25149 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25151 // aligned_addr = (addr + (align-1)) & ~(align-1)
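// Worked example: addr == 0x1009 with align == 16 gives
// (0x1009 + 15) & ~15 == 0x1010.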
25152 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25153 .addReg(OverflowAddrReg)
25154 .addImm(Align-1);
25156 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25157 .addReg(TmpReg)
25158 .addImm(~(uint64_t)(Align-1));
25159 } else {
25160 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25161 .addReg(OverflowAddrReg);
25162 }
25164 // Compute the next overflow address after this argument.
25165 // (the overflow address should be kept 8-byte aligned)
25166 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25167 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25168 .addReg(OverflowDestReg)
25169 .addImm(ArgSizeA8);
25171 // Store the new overflow address.
25172 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25173 .add(Base)
25174 .add(Scale)
25175 .add(Index)
25176 .addDisp(Disp, 8)
25177 .add(Segment)
25178 .addReg(NextAddrReg)
25179 .setMemRefs(MMOBegin, MMOEnd);
25181 // If we branched, emit the PHI to the front of endMBB.
25182 if (offsetMBB) {
25183 BuildMI(*endMBB, endMBB->begin(), DL,
25184 TII->get(X86::PHI), DestReg)
25185 .addReg(OffsetDestReg).addMBB(offsetMBB)
25186 .addReg(OverflowDestReg).addMBB(overflowMBB);
25187 }
25189 // Erase the pseudo instruction
25190 MI.eraseFromParent();
25192 return endMBB;
25193 }
25195 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25196 MachineInstr &MI, MachineBasicBlock *MBB) const {
25197 // Emit code to save XMM registers to the stack. The ABI says that the
25198 // number of registers to save is given in %al, so it's theoretically
25199 // possible to do an indirect jump trick to avoid saving all of them;
25200 // however, this code takes a simpler approach and just executes all
25201 // of the stores if %al is non-zero. It's less code, and it's probably
25202 // easier on the hardware branch predictor, and stores aren't all that
25203 // expensive anyway.
25205 // Create the new basic blocks. One block contains all the XMM stores,
25206 // and one block is the final destination regardless of whether any
25207 // stores were performed.
25208 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25209 MachineFunction *F = MBB->getParent();
25210 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25211 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25212 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25213 F->insert(MBBIter, XMMSaveMBB);
25214 F->insert(MBBIter, EndMBB);
25216 // Transfer the remainder of MBB and its successor edges to EndMBB.
25217 EndMBB->splice(EndMBB->begin(), MBB,
25218 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25219 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25221 // The original block will now fall through to the XMM save block.
25222 MBB->addSuccessor(XMMSaveMBB);
25223 // The XMMSaveMBB will fall through to the end block.
25224 XMMSaveMBB->addSuccessor(EndMBB);
25226 // Now add the instructions.
25227 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25228 DebugLoc DL = MI.getDebugLoc();
25230 unsigned CountReg = MI.getOperand(0).getReg();
25231 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25232 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25234 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25235 // If %al is 0, branch around the XMM save block.
25236 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25237 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25238 MBB->addSuccessor(EndMBB);
25239 }
25241 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25242 // that was just emitted, but clearly shouldn't be "saved".
25243 assert((MI.getNumOperands() <= 3 ||
25244 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25245 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25246 "Expected last argument to be EFLAGS");
25247 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25248 // In the XMM save block, save all the XMM argument registers.
25249 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25250 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
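// For example, the second XMM argument register (i == 4) is stored at
// VarArgsFPOffset + 16 within the register save area.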
25251 MachineMemOperand *MMO = F->getMachineMemOperand(
25252 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25253 MachineMemOperand::MOStore,
25254 /*Size=*/16, /*Align=*/16);
25255 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25256 .addFrameIndex(RegSaveFrameIndex)
25257 .addImm(/*Scale=*/1)
25258 .addReg(/*IndexReg=*/0)
25259 .addImm(/*Disp=*/Offset)
25260 .addReg(/*Segment=*/0)
25261 .addReg(MI.getOperand(i).getReg())
25262 .addMemOperand(MMO);
25263 }
25265 MI.eraseFromParent(); // The pseudo instruction is gone now.
25267 return EndMBB;
25268 }
25270 // The EFLAGS operand of SelectItr might be missing a kill marker
25271 // because there were multiple uses of EFLAGS, and ISel didn't know
25272 // which to mark. Figure out whether SelectItr should have had a
25273 // kill marker, and set it if it should. Returns the correct kill
25274 // marker value.
25275 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25276 MachineBasicBlock* BB,
25277 const TargetRegisterInfo* TRI) {
25278 // Scan forward through BB for a use/def of EFLAGS.
25279 MachineBasicBlock::iterator miI(std::next(SelectItr));
25280 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25281 const MachineInstr& mi = *miI;
25282 if (mi.readsRegister(X86::EFLAGS))
25283 return false;
25284 if (mi.definesRegister(X86::EFLAGS))
25285 break; // Should have kill-flag - update below.
25288 // If we hit the end of the block, check whether EFLAGS is live into a
25289 // successor.
25290 if (miI == BB->end()) {
25291 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25292 sEnd = BB->succ_end();
25293 sItr != sEnd; ++sItr) {
25294 MachineBasicBlock* succ = *sItr;
25295 if (succ->isLiveIn(X86::EFLAGS))
25296 return false;
25297 }
25298 }
25300 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25301 // out. SelectMI should have a kill flag on EFLAGS.
25302 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25303 return true;
25304 }
25306 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25307 // together with other CMOV pseudo-opcodes into a single basic-block with
25308 // conditional jump around it.
25309 static bool isCMOVPseudo(MachineInstr &MI) {
25310 switch (MI.getOpcode()) {
25311 case X86::CMOV_FR32:
25312 case X86::CMOV_FR64:
25313 case X86::CMOV_GR8:
25314 case X86::CMOV_GR16:
25315 case X86::CMOV_GR32:
25316 case X86::CMOV_RFP32:
25317 case X86::CMOV_RFP64:
25318 case X86::CMOV_RFP80:
25319 case X86::CMOV_V2F64:
25320 case X86::CMOV_V2I64:
25321 case X86::CMOV_V4F32:
25322 case X86::CMOV_V4F64:
25323 case X86::CMOV_V4I64:
25324 case X86::CMOV_V16F32:
25325 case X86::CMOV_V8F32:
25326 case X86::CMOV_V8F64:
25327 case X86::CMOV_V8I64:
25328 case X86::CMOV_V8I1:
25329 case X86::CMOV_V16I1:
25330 case X86::CMOV_V32I1:
25331 case X86::CMOV_V64I1:
25332 return true;
25334 default:
25335 return false;
25336 }
25337 }
25339 MachineBasicBlock *
25340 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25341 MachineBasicBlock *BB) const {
25342 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25343 DebugLoc DL = MI.getDebugLoc();
25345 // To "insert" a SELECT_CC instruction, we actually have to insert the
25346 // diamond control-flow pattern. The incoming instruction knows the
25347 // destination vreg to set, the condition code register to branch on, the
25348 // true/false values to select between, and a branch opcode to use.
25349 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25350 MachineFunction::iterator It = ++BB->getIterator();
25352 // thisMBB:
25353 // ...
25354 // TrueVal = ...
25355 // cmpTY ccX, r1, r2
25356 // bCC copy1MBB
25357 // fallthrough --> copy0MBB
25358 MachineBasicBlock *thisMBB = BB;
25359 MachineFunction *F = BB->getParent();
25361 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25362 // as described above, by inserting a BB, and then making a PHI at the join
25363 // point to select the true and false operands of the CMOV in the PHI.
25365 // The code also handles two different cases of multiple CMOV opcodes
25366 // next to each other.
25368 // Case 1:
25369 // In this case, there are multiple CMOVs in a row, all of which are based on
25370 // the same condition setting (or the exact opposite condition setting).
25371 // In this case we can lower all the CMOVs using a single inserted BB, and
25372 // then make a number of PHIs at the join point to model the CMOVs. The only
25373 // trickiness here is that in a case like:
25375 // t2 = CMOV cond1 t1, f1
25376 // t3 = CMOV cond1 t2, f2
25378 // when rewriting this into PHIs, we have to perform some renaming on the
25379 // temps since you cannot have a PHI operand refer to a PHI result earlier
25380 // in the same block. The "simple" but wrong lowering would be:
25382 // t2 = PHI t1(BB1), f1(BB2)
25383 // t3 = PHI t2(BB1), f2(BB2)
25385 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25386 // renaming is to note that on the path through BB1, t2 is really just a
25387 // copy of t1, and do that renaming, properly generating:
25389 // t2 = PHI t1(BB1), f1(BB2)
25390 // t3 = PHI t1(BB1), f2(BB2)
25392 // Case 2: we lower cascaded CMOVs such as
25394 // (CMOV (CMOV F, T, cc1), T, cc2)
25396 // to two successive branches. For that, we look for another CMOV as the
25397 // following instruction.
25399 // Without this, we would add a PHI between the two jumps, which ends up
25400 // creating a few copies all around. For instance, for
25402 // (sitofp (zext (fcmp une)))
25404 // we would generate:
25406 // ucomiss %xmm1, %xmm0
25407 // movss <1.0f>, %xmm0
25408 // movaps %xmm0, %xmm1
25409 // jne .LBB5_2
25410 // xorps %xmm1, %xmm1
25411 // .LBB5_2:
25412 // jp .LBB5_4
25413 // movaps %xmm1, %xmm0
25414 // .LBB5_4:
25415 // retq
25417 // because this custom-inserter would have generated:
25418 //
25419 //   A
25420 //   | \
25421 //   |  B
25422 //   | /
25423 //   C
25424 //   | \
25425 //   |  D
25426 //   | /
25427 //   E
25428 //
25429 // A: X = ...; Y = ...
25430 // B: empty
25431 // C: Z = PHI [X, A], [Y, B]
25432 // D: empty
25433 // E: PHI [X, C], [Z, D]
25434 //
25435 // If we lower both CMOVs in a single step, we can instead generate:
25436 //
25437 //   A
25438 //   | \
25439 //   |  C
25440 //   | /|
25441 //   |/ |
25442 //   |  |
25443 //   |  D
25444 //   | /
25445 //   E
25446 //
25447 // A: X = ...; Y = ...
25448 // D: empty
25449 // E: PHI [X, A], [X, C], [Y, D]
25451 // Which, in our sitofp/fcmp example, gives us something like:
25453 // ucomiss %xmm1, %xmm0
25454 // movss <1.0f>, %xmm0
25455 // jne .LBB5_4
25456 // jp .LBB5_4
25457 // xorps %xmm0, %xmm0
25458 // .LBB5_4:
25459 // retq
25461 MachineInstr *CascadedCMOV = nullptr;
25462 MachineInstr *LastCMOV = &MI;
25463 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25464 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25465 MachineBasicBlock::iterator NextMIIt =
25466 std::next(MachineBasicBlock::iterator(MI));
25468 // Check for case 1, where there are multiple CMOVs with the same condition
25469 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
25470 // number of jumps the most.
25472 if (isCMOVPseudo(MI)) {
25473 // See if we have a string of CMOVS with the same condition.
25474 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25475 (NextMIIt->getOperand(3).getImm() == CC ||
25476 NextMIIt->getOperand(3).getImm() == OppCC)) {
25477 LastCMOV = &*NextMIIt;
25478 ++NextMIIt;
25479 }
25482 // This checks for case 2, but only do this if we didn't already find
25483 // case 1, as indicated by LastCMOV == MI.
25484 if (LastCMOV == &MI && NextMIIt != BB->end() &&
25485 NextMIIt->getOpcode() == MI.getOpcode() &&
25486 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25487 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25488 NextMIIt->getOperand(1).isKill()) {
25489 CascadedCMOV = &*NextMIIt;
25490 }
25492 MachineBasicBlock *jcc1MBB = nullptr;
25494 // If we have a cascaded CMOV, we lower it to two successive branches to
25495 // the same block. EFLAGS is used by both, so mark it as live in the second.
25496 if (CascadedCMOV) {
25497 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25498 F->insert(It, jcc1MBB);
25499 jcc1MBB->addLiveIn(X86::EFLAGS);
25500 }
25502 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25503 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25504 F->insert(It, copy0MBB);
25505 F->insert(It, sinkMBB);
25507 // If the EFLAGS register isn't dead in the terminator, then claim that it's
25508 // live into the sink and copy blocks.
25509 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25511 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25512 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25513 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25514 copy0MBB->addLiveIn(X86::EFLAGS);
25515 sinkMBB->addLiveIn(X86::EFLAGS);
25516 }
25518 // Transfer the remainder of BB and its successor edges to sinkMBB.
25519 sinkMBB->splice(sinkMBB->begin(), BB,
25520 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25521 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25523 // Add the true and fallthrough blocks as its successors.
25524 if (CascadedCMOV) {
25525 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25526 BB->addSuccessor(jcc1MBB);
25528 // In that case, jcc1MBB will itself either fall through to copy0MBB or
25529 // jump to the sinkMBB.
25530 jcc1MBB->addSuccessor(copy0MBB);
25531 jcc1MBB->addSuccessor(sinkMBB);
25533 BB->addSuccessor(copy0MBB);
25534 }
25536 // The true block target of the first (or only) branch is always sinkMBB.
25537 BB->addSuccessor(sinkMBB);
25539 // Create the conditional branch instruction.
25540 unsigned Opc = X86::GetCondBranchFromCond(CC);
25541 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25543 if (CascadedCMOV) {
25544 unsigned Opc2 = X86::GetCondBranchFromCond(
25545 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25546 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25547 }
25549 // copy0MBB:
25550 // %FalseValue = ...
25551 // # fallthrough to sinkMBB
25552 copy0MBB->addSuccessor(sinkMBB);
25554 // sinkMBB:
25555 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25557 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25558 MachineBasicBlock::iterator MIItEnd =
25559 std::next(MachineBasicBlock::iterator(LastCMOV));
25560 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25561 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25562 MachineInstrBuilder MIB;
25564 // As we are creating the PHIs, we have to be careful if there is more than
25565 // one. Later CMOVs may reference the results of earlier CMOVs, but later
25566 // PHIs have to reference the individual true/false inputs from earlier PHIs.
25567 // That also means that PHI construction must work forward from earlier to
25568 // later, and that the code must maintain a mapping from each earlier PHI's
25569 // destination register to the registers that went into that PHI.
25571 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25572 unsigned DestReg = MIIt->getOperand(0).getReg();
25573 unsigned Op1Reg = MIIt->getOperand(1).getReg();
25574 unsigned Op2Reg = MIIt->getOperand(2).getReg();
25576 // If this CMOV we are generating is the opposite condition from
25577 // the jump we generated, then we have to swap the operands for the
25578 // PHI that is going to be generated.
25579 if (MIIt->getOperand(3).getImm() == OppCC)
25580 std::swap(Op1Reg, Op2Reg);
25582 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25583 Op1Reg = RegRewriteTable[Op1Reg].first;
25585 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25586 Op2Reg = RegRewriteTable[Op2Reg].second;
25588 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25589 TII->get(X86::PHI), DestReg)
25590 .addReg(Op1Reg).addMBB(copy0MBB)
25591 .addReg(Op2Reg).addMBB(thisMBB);
25593 // Add this PHI to the rewrite table.
25594 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25595 }
25597 // If we have a cascaded CMOV, the second Jcc provides the same incoming
25598 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25599 if (CascadedCMOV) {
25600 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25601 // Copy the PHI result to the register defined by the second CMOV.
25602 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25603 DL, TII->get(TargetOpcode::COPY),
25604 CascadedCMOV->getOperand(0).getReg())
25605 .addReg(MI.getOperand(0).getReg());
25606 CascadedCMOV->eraseFromParent();
25607 }
25609 // Now remove the CMOV(s).
25610 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25611 (MIIt++)->eraseFromParent();
25613 return sinkMBB;
25614 }
25616 MachineBasicBlock *
25617 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25618 MachineBasicBlock *BB) const {
25619 // Combine the following atomic floating-point modification pattern:
25620 // a.store(reg OP a.load(acquire), release)
25621 // Transform them into:
25622 // OPss (%gpr), %xmm
25623 // movss %xmm, (%gpr)
25624 // Or sd equivalent for 64-bit operations.
25625 unsigned FOp, MOp;
25626 switch (MI.getOpcode()) {
25627 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25628 case X86::RELEASE_FADD32mr:
25629 FOp = X86::ADDSSrm;
25630 MOp = X86::MOVSSmr;
25631 break;
25632 case X86::RELEASE_FADD64mr:
25633 FOp = X86::ADDSDrm;
25634 MOp = X86::MOVSDmr;
25635 break;
25636 }
25637 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25638 DebugLoc DL = MI.getDebugLoc();
25639 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25640 unsigned ValOpIdx = X86::AddrNumOperands;
25641 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25642 MachineInstrBuilder MIB =
25643 BuildMI(*BB, MI, DL, TII->get(FOp),
25644 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25645 .addReg(VSrc);
25646 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25647 MachineOperand &Operand = MI.getOperand(i);
25648 // Clear any kill flags on register operands as we'll create a second
25649 // instruction using the same address operands.
25650 if (Operand.isReg())
25651 Operand.setIsKill(false);
25652 MIB.add(Operand);
25653 }
25654 MachineInstr *FOpMI = MIB;
25655 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25656 for (int i = 0; i < X86::AddrNumOperands; ++i)
25657 MIB.add(MI.getOperand(i));
25658 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25659 MI.eraseFromParent(); // The pseudo instruction is gone now.
25661 return BB;
25662 }
25663 MachineBasicBlock *
25664 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25665 MachineBasicBlock *BB) const {
25666 MachineFunction *MF = BB->getParent();
25667 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25668 DebugLoc DL = MI.getDebugLoc();
25669 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25671 assert(MF->shouldSplitStack());
25673 const bool Is64Bit = Subtarget.is64Bit();
25674 const bool IsLP64 = Subtarget.isTarget64BitLP64();
25676 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25677 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
25679 // BB:
25680 // ... [Till the alloca]
25681 // If stacklet is not large enough, jump to mallocMBB
25682 //
25683 // bumpMBB:
25684 // Allocate by subtracting from RSP
25685 // Jump to continueMBB
25686 //
25687 // mallocMBB:
25688 // Allocate by call to runtime
25689 //
25690 // continueMBB:
25691 // ...
25692 // [rest of original BB]
25695 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25696 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25697 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25699 MachineRegisterInfo &MRI = MF->getRegInfo();
25700 const TargetRegisterClass *AddrRegClass =
25701 getRegClassFor(getPointerTy(MF->getDataLayout()));
25703 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25704 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25705 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25706 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25707 sizeVReg = MI.getOperand(1).getReg(),
25708 physSPReg =
25709 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25711 MachineFunction::iterator MBBIter = ++BB->getIterator();
25713 MF->insert(MBBIter, bumpMBB);
25714 MF->insert(MBBIter, mallocMBB);
25715 MF->insert(MBBIter, continueMBB);
25717 continueMBB->splice(continueMBB->begin(), BB,
25718 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25719 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25721 // Add code to the main basic block to check if the stack limit has been hit,
25722 // and if so, jump to mallocMBB otherwise to bumpMBB.
25723 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
25724 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
25725 .addReg(tmpSPVReg).addReg(sizeVReg);
25726 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
25727 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
25728 .addReg(SPLimitVReg);
25729 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
25731 // bumpMBB simply decreases the stack pointer, since we know the current
25732 // stacklet has enough space.
25733 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
25734 .addReg(SPLimitVReg);
25735 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
25736 .addReg(SPLimitVReg);
25737 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25739 // Calls into a routine in libgcc to allocate more space from the heap.
25740 const uint32_t *RegMask =
25741 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
25742 if (IsLP64) {
25743 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
25744 .addReg(sizeVReg);
25745 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25746 .addExternalSymbol("__morestack_allocate_stack_space")
25747 .addRegMask(RegMask)
25748 .addReg(X86::RDI, RegState::Implicit)
25749 .addReg(X86::RAX, RegState::ImplicitDefine);
25750 } else if (Is64Bit) {
25751 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
25752 .addReg(sizeVReg);
25753 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25754 .addExternalSymbol("__morestack_allocate_stack_space")
25755 .addRegMask(RegMask)
25756 .addReg(X86::EDI, RegState::Implicit)
25757 .addReg(X86::EAX, RegState::ImplicitDefine);
25758 } else {
25759 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
25760 .addImm(16);
25761 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
25762 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
25763 .addExternalSymbol("__morestack_allocate_stack_space")
25764 .addRegMask(RegMask)
25765 .addReg(X86::EAX, RegState::ImplicitDefine);
25766 }
25768 if (!Is64Bit)
25769 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
25770 .addImm(16);
25772 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
25773 .addReg(IsLP64 ? X86::RAX : X86::EAX);
25774 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25776 // Set up the CFG correctly.
25777 BB->addSuccessor(bumpMBB);
25778 BB->addSuccessor(mallocMBB);
25779 mallocMBB->addSuccessor(continueMBB);
25780 bumpMBB->addSuccessor(continueMBB);
25782 // Take care of the PHI nodes.
25783 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
25784 MI.getOperand(0).getReg())
25785 .addReg(mallocPtrVReg)
25786 .addMBB(mallocMBB)
25787 .addReg(bumpSPPtrVReg)
25788 .addMBB(bumpMBB);
25790 // Delete the original pseudo instruction.
25791 MI.eraseFromParent();
25794 return continueMBB;
25795 }
25797 MachineBasicBlock *
25798 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
25799 MachineBasicBlock *BB) const {
25800 MachineFunction *MF = BB->getParent();
25801 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25802 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25803 DebugLoc DL = MI.getDebugLoc();
25805 assert(!isAsynchronousEHPersonality(
25806 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25807 "SEH does not use catchret!");
25809 // Only 32-bit EH needs to worry about manually restoring stack pointers.
25810 if (!Subtarget.is32Bit())
25811 return BB;
25813 // C++ EH creates a new target block to hold the restore code, and wires up
25814 // the new block to the return destination with a normal JMP_4.
25815 MachineBasicBlock *RestoreMBB =
25816 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25817 assert(BB->succ_size() == 1);
25818 MF->insert(std::next(BB->getIterator()), RestoreMBB);
25819 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25820 BB->addSuccessor(RestoreMBB);
25821 MI.getOperand(0).setMBB(RestoreMBB);
25823 auto RestoreMBBI = RestoreMBB->begin();
25824 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
25825 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
25826 return BB;
25827 }
25829 MachineBasicBlock *
25830 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25831 MachineBasicBlock *BB) const {
25832 MachineFunction *MF = BB->getParent();
25833 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25834 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25835 // Only 32-bit SEH requires special handling for catchpad.
25836 if (IsSEH && Subtarget.is32Bit()) {
25837 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25838 DebugLoc DL = MI.getDebugLoc();
25839 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
25840 }
25841 MI.eraseFromParent();
25842 return BB;
25843 }
25845 MachineBasicBlock *
25846 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25847 MachineBasicBlock *BB) const {
  // So, here we replace TLSADDR with the sequence:
  // adjust_stackdown -> TLSADDR -> adjust_stackup.
  // We need this because TLSADDR is lowered into calls
  // inside MC, therefore without the two markers shrink-wrapping
  // may push the prologue/epilogue past them.
25853 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25854 DebugLoc DL = MI.getDebugLoc();
25855 MachineFunction &MF = *BB->getParent();
25857 // Emit CALLSEQ_START right before the instruction.
25858 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25859 MachineInstrBuilder CallseqStart =
25860 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
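  // Note: all adjustment amounts are zero here; the markers carry no stack
  // adjustment of their own and exist only so later passes see a bracketed
  // call region.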
25861 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25863 // Emit CALLSEQ_END right after the instruction.
25864 // We don't call erase from parent because we want to keep the
25865 // original instruction around.
25866 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25867 MachineInstrBuilder CallseqEnd =
25868 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);

  return BB;
}
25874 MachineBasicBlock *
25875 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25876 MachineBasicBlock *BB) const {
25877 // This is pretty easy. We're taking the value that we received from
25878 // our load from the relocation, sticking it in either RDI (x86-64)
25879 // or EAX and doing an indirect call. The return value will then
25880 // be in the normal return register.
25881 MachineFunction *F = BB->getParent();
25882 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25883 DebugLoc DL = MI.getDebugLoc();
25885 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25886 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25888 // Get a register mask for the lowered call.
25889 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25890 // proper register mask.
25891 const uint32_t *RegMask =
25892 Subtarget.is64Bit() ?
25893 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25894 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
25895 if (Subtarget.is64Bit()) {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
            .addReg(X86::RIP)
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
    addDirectMem(MIB, X86::RDI);
    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
25907 } else if (!isPositionIndependent()) {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(0)
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(TII->getGlobalBaseReg(F))
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
25937 MachineBasicBlock *
25938 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25939 MachineBasicBlock *MBB) const {
25940 DebugLoc DL = MI.getDebugLoc();
25941 MachineFunction *MF = MBB->getParent();
25942 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25943 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25944 MachineRegisterInfo &MRI = MF->getRegInfo();
25946 const BasicBlock *BB = MBB->getBasicBlock();
25947 MachineFunction::iterator I = ++MBB->getIterator();
25949 // Memory Reference
25950 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25951 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
  unsigned DstReg;
  unsigned MemOpndSlot = 0;

  unsigned CurOp = 0;

  DstReg = MI.getOperand(CurOp++).getReg();
25959 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25960 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
25962 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25963 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
25965 MemOpndSlot = CurOp;
25967 MVT PVT = getPointerTy(MF->getDataLayout());
25968 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25969 "Invalid Pointer Size!");
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  // restoreMBB:
  //  if base pointer being used, load it from frame
  //  v_restore = 1
25987 MachineBasicBlock *thisMBB = MBB;
25988 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25989 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25990 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
25991 MF->insert(I, mainMBB);
25992 MF->insert(I, sinkMBB);
25993 MF->push_back(restoreMBB);
25994 restoreMBB->setHasAddressTaken();
25996 MachineInstrBuilder MIB;
25998 // Transfer the remainder of BB and its successor edges to sinkMBB.
25999 sinkMBB->splice(sinkMBB->begin(), MBB,
26000 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26001 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26004 unsigned PtrStoreOpc = 0;
26005 unsigned LabelReg = 0;
26006 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26007 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26008 !isPositionIndependent();
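  // With the small code model and no PIC the restore label is guaranteed to
  // fit in a 32-bit immediate, so it can be stored directly; otherwise it is
  // materialized into a register with a LEA first.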
26010 // Prepare IP either in reg or imm.
26011 if (!UseImmLabel) {
26012 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26013 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26014 LabelReg = MRI.createVirtualRegister(PtrRC);
26015 if (Subtarget.is64Bit()) {
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
              .addReg(X86::RIP)
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB)
              .addReg(0);
    } else {
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
              .addReg(XII->getGlobalBaseReg(MF))
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
              .addReg(0);
    }
  } else
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
    else
      MIB.add(MI.getOperand(MemOpndSlot + i));
  }
  if (!UseImmLabel)
    MIB.addReg(LabelReg);
  else
    MIB.addMBB(restoreMBB);
  MIB.setMemRefs(MMOBegin, MMOEnd);

  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
          .addMBB(restoreMBB);
26050 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26051 MIB.addRegMask(RegInfo->getNoPreservedMask());
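  // Control may re-enter here via a longjmp while registers hold arbitrary
  // values, so conservatively treat every register as clobbered across the
  // setjmp.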
26052 thisMBB->addSuccessor(mainMBB);
26053 thisMBB->addSuccessor(restoreMBB);
  // mainMBB:
  //  EAX = 0
  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26062 TII->get(X86::PHI), DstReg)
26063 .addReg(mainDstReg).addMBB(mainMBB)
26064 .addReg(restoreDstReg).addMBB(restoreMBB);
  // restoreMBB:
  if (RegInfo->hasBasePointer(*MF)) {
26068 const bool Uses64BitFramePtr =
26069 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26070 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26071 X86FI->setRestoreBasePointer(MF);
26072 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26073 unsigned BasePtr = RegInfo->getBaseRegister();
26074 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26075 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26076 FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
  }
  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
  restoreMBB->addSuccessor(sinkMBB);

  MI.eraseFromParent();
  return sinkMBB;
}
26087 MachineBasicBlock *
26088 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26089 MachineBasicBlock *MBB) const {
26090 DebugLoc DL = MI.getDebugLoc();
26091 MachineFunction *MF = MBB->getParent();
26092 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26093 MachineRegisterInfo &MRI = MF->getRegInfo();
26095 // Memory Reference
26096 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26097 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26099 MVT PVT = getPointerTy(MF->getDataLayout());
26100 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26101 "Invalid Pointer Size!");
26103 const TargetRegisterClass *RC =
26104 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26105 unsigned Tmp = MRI.createVirtualRegister(RC);
26106 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26107 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26108 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26109 unsigned SP = RegInfo->getStackRegister();
26111 MachineInstrBuilder MIB;
26113 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26114 const int64_t SPOffset = 2 * PVT.getStoreSize();
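  // The jmp_buf is read as pointer-sized slots: slot 0 holds the frame
  // pointer, slot 1 the resume IP and slot 2 the stack pointer.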
  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

  // Reload FP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Reload IP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), LabelOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload SP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), SPOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Jump
  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

  MI.eraseFromParent();
  return MBB;
}
26149 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26150 MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
26153 DebugLoc DL = MI.getDebugLoc();
26154 MachineFunction *MF = MBB->getParent();
26155 MachineRegisterInfo *MRI = &MF->getRegInfo();
26156 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26158 MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

  unsigned Op = 0;
  unsigned VR = 0;

  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  if (UseImmLabel) {
    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  } else {
    const TargetRegisterClass *TRC =
        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
    VR = MRI->createVirtualRegister(TRC);
    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
    if (Subtarget.is64Bit())
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
          .addReg(X86::RIP)
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB)
          .addReg(0);
    else
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
          .addReg(0) /* TII->getGlobalBaseReg(MF) */
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
          .addReg(0);
  }

  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
  addFrameReference(MIB, FI, 36);
  if (UseImmLabel)
    MIB.addMBB(DispatchBB);
  else
    MIB.addReg(VR);
}
26199 MachineBasicBlock *
26200 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26201 MachineBasicBlock *BB) const {
26202 DebugLoc DL = MI.getDebugLoc();
26203 MachineFunction *MF = BB->getParent();
26204 MachineFrameInfo &MFI = MF->getFrameInfo();
26205 MachineRegisterInfo *MRI = &MF->getRegInfo();
26206 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26207 int FI = MFI.getFunctionContextIndex();
26209 // Get a mapping of the call site numbers to all of the landing pads they're
26210 // associated with.
26211 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26212 unsigned MaxCSNum = 0;
26213 for (auto &MBB : *MF) {
    if (!MBB.isEHPad())
      continue;
26217 MCSymbol *Sym = nullptr;
26218 for (const auto &MI : MBB) {
      if (MI.isDebugValue())
        continue;

      assert(MI.isEHLabel() && "expected EH_LABEL");
      Sym = MI.getOperand(0).getMCSymbol();
      break;
    }

    if (!MF->hasCallSiteLandingPad(Sym))
      continue;
26230 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26231 CallSiteNumToLPad[CSI].push_back(&MBB);
      MaxCSNum = std::max(MaxCSNum, CSI);
    }
  }
26236 // Get an ordered list of the machine basic blocks for the jump table.
26237 std::vector<MachineBasicBlock *> LPadList;
26238 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26239 LPadList.reserve(CallSiteNumToLPad.size());
26241 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26242 for (auto &LP : CallSiteNumToLPad[CSI]) {
26243 LPadList.push_back(LP);
      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
    }
  }
26248 assert(!LPadList.empty() &&
26249 "No landing pad destinations for the dispatch jump table!");
26251 // Create the MBBs for the dispatch code.
26253 // Shove the dispatch's address into the return slot in the function context.
26254 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26255 DispatchBB->setIsEHPad(true);
26257 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26258 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26259 DispatchBB->addSuccessor(TrapBB);
26261 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26262 DispatchBB->addSuccessor(DispContBB);
26265 MF->push_back(DispatchBB);
26266 MF->push_back(DispContBB);
26267 MF->push_back(TrapBB);
  // Insert code into the entry block that creates and registers the function
  // context.
26271 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26273 // Create the jump table and associated information
26274 MachineJumpTableInfo *JTI =
26275 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26276 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26278 const X86RegisterInfo &RI = TII->getRegisterInfo();
26279 // Add a register mask with no preserved registers. This results in all
26280 // registers being marked as clobbered.
26281 if (RI.hasBasePointer(*MF)) {
26282 const bool FPIs64Bit =
26283 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26284 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26285 MFI->setRestoreBasePointer(MF);
26287 unsigned FP = RI.getFrameRegister(*MF);
26288 unsigned BP = RI.getBaseRegister();
26289 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26290 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26291 MFI->getRestoreBasePointerOffset())
        .addRegMask(RI.getNoPreservedMask());
  } else {
    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
        .addRegMask(RI.getNoPreservedMask());
  }
26298 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
                    Subtarget.is64Bit() ? 8 : 4);
  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
      .addReg(IReg)
      .addImm(LPadList.size());
26304 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
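  // Call-site indices are 1-based, so an index above the table size indicates
  // a corrupted function context; trap in that case. The in-range index is
  // biased down by one below to form the jump-table index.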
26306 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
      .addReg(IReg)
      .addImm(1);
  BuildMI(DispContBB, DL,
          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
      .addReg(0)
      .addImm(Subtarget.is64Bit() ? 8 : 4)
      .addReg(JReg)
      .addJumpTableIndex(MJTI)
      .addReg(0);
26318 // Add the jump table entries as successors to the MBB.
26319 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26320 for (auto &LP : LPadList)
26321 if (SeenMBBs.insert(LP).second)
26322 DispContBB->addSuccessor(LP);
26324 // N.B. the order the invoke BBs are processed in doesn't matter here.
26325 SmallVector<MachineBasicBlock *, 64> MBBLPads;
26326 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26327 for (MachineBasicBlock *MBB : InvokeBBs) {
26328 // Remove the landing pad successor from the invoke block and replace it
26329 // with the new dispatch block.
26330 // Keep a copy of Successors since it's modified inside the loop.
    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
                                                   MBB->succ_rend());
26333 // FIXME: Avoid quadratic complexity.
26334 for (auto MBBS : Successors) {
26335 if (MBBS->isEHPad()) {
26336 MBB->removeSuccessor(MBBS);
        MBBLPads.push_back(MBBS);
      }
    }
26341 MBB->addSuccessor(DispatchBB);
26343 // Find the invoke call and mark all of the callee-saved registers as
26344 // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (auto &II : reverse(*MBB)) {
      if (!II.isCall())
        continue;
26351 DenseMap<unsigned, bool> DefRegs;
      for (auto &MOp : II.operands())
        if (MOp.isReg())
          DefRegs[MOp.getReg()] = true;
26356 MachineInstrBuilder MIB(*MF, &II);
26357 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26358 unsigned Reg = SavedRegs[RI];
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }
26367 // Mark all former landing pads as non-landing pads. The dispatch is the only
26368 // landing pad now.
26369 for (auto &LP : MBBLPads)
26370 LP->setIsEHPad(false);
26372 // The instruction is gone now.
  MI.eraseFromParent();
  return BB;
}
26377 MachineBasicBlock *
26378 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26379 MachineBasicBlock *BB) const {
26380 MachineFunction *MF = BB->getParent();
26381 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26382 DebugLoc DL = MI.getDebugLoc();
26384 switch (MI.getOpcode()) {
26385 default: llvm_unreachable("Unexpected instr type to insert");
26386 case X86::TAILJMPd64:
26387 case X86::TAILJMPr64:
26388 case X86::TAILJMPm64:
26389 case X86::TAILJMPr64_REX:
26390 case X86::TAILJMPm64_REX:
26391 llvm_unreachable("TAILJMP64 would not be touched here.");
26392 case X86::TCRETURNdi64:
26393 case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    return BB;
26396 case X86::TLS_addr32:
26397 case X86::TLS_addr64:
26398 case X86::TLS_base_addr32:
26399 case X86::TLS_base_addr64:
26400 return EmitLoweredTLSAddr(MI, BB);
26401 case X86::CATCHRET:
26402 return EmitLoweredCatchRet(MI, BB);
26403 case X86::CATCHPAD:
26404 return EmitLoweredCatchPad(MI, BB);
26405 case X86::SEG_ALLOCA_32:
26406 case X86::SEG_ALLOCA_64:
26407 return EmitLoweredSegAlloca(MI, BB);
26408 case X86::TLSCall_32:
26409 case X86::TLSCall_64:
26410 return EmitLoweredTLSCall(MI, BB);
26411 case X86::CMOV_FR32:
26412 case X86::CMOV_FR64:
26413 case X86::CMOV_FR128:
26414 case X86::CMOV_GR8:
26415 case X86::CMOV_GR16:
26416 case X86::CMOV_GR32:
26417 case X86::CMOV_RFP32:
26418 case X86::CMOV_RFP64:
26419 case X86::CMOV_RFP80:
26420 case X86::CMOV_V2F64:
26421 case X86::CMOV_V2I64:
26422 case X86::CMOV_V4F32:
26423 case X86::CMOV_V4F64:
26424 case X86::CMOV_V4I64:
26425 case X86::CMOV_V16F32:
26426 case X86::CMOV_V8F32:
26427 case X86::CMOV_V8F64:
26428 case X86::CMOV_V8I64:
26429 case X86::CMOV_V8I1:
26430 case X86::CMOV_V16I1:
26431 case X86::CMOV_V32I1:
26432 case X86::CMOV_V64I1:
26433 return EmitLoweredSelect(MI, BB);
26435 case X86::RDFLAGS32:
26436 case X86::RDFLAGS64: {
    unsigned PushF =
        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26439 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26440 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26441 // Permit reads of the FLAGS register without it being defined.
26442 // This intrinsic exists to read external processor state in flags, such as
26443 // the trap flag, interrupt flag, and direction flag, none of which are
26444 // modeled by the backend.
26445 Push->getOperand(2).setIsUndef();
26446 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
26452 case X86::WRFLAGS32:
26453 case X86::WRFLAGS64: {
    unsigned Push =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
    unsigned PopF =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26458 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26459 BuildMI(*BB, MI, DL, TII->get(PopF));
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
26465 case X86::RELEASE_FADD32mr:
26466 case X86::RELEASE_FADD64mr:
26467 return EmitLoweredAtomicFP(MI, BB);
26469 case X86::FP32_TO_INT16_IN_MEM:
26470 case X86::FP32_TO_INT32_IN_MEM:
26471 case X86::FP32_TO_INT64_IN_MEM:
26472 case X86::FP64_TO_INT16_IN_MEM:
26473 case X86::FP64_TO_INT32_IN_MEM:
26474 case X86::FP64_TO_INT64_IN_MEM:
26475 case X86::FP80_TO_INT16_IN_MEM:
26476 case X86::FP80_TO_INT32_IN_MEM:
26477 case X86::FP80_TO_INT64_IN_MEM: {
26478 // Change the floating point control register to use "round towards zero"
26479 // mode when truncating to an integer value.
26480 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26481 addFrameReference(BuildMI(*BB, MI, DL,
26482 TII->get(X86::FNSTCW16m)), CWFrameIdx);
    // Load the old value of the high byte of the control word...
    unsigned OldCW =
        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);
26490 // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
        .addImm(0xC7F);
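    // 0xC7F sets RC (bits 11:10) to 11b = round toward zero, keeps 64-bit
    // precision, and leaves all exception mask bits set.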
26494 // Reload the modified control word now...
26495 addFrameReference(BuildMI(*BB, MI, DL,
26496 TII->get(X86::FLDCW16m)), CWFrameIdx);
26498 // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
        .addReg(OldCW);
    // Get the X86 opcode to use.
    unsigned Opc;
26505 default: llvm_unreachable("illegal opcode!");
26506 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26507 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26508 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26509 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26510 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26511 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26512 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26513 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }
26517 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26518 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26519 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26521 // Reload the original control word now.
26522 addFrameReference(BuildMI(*BB, MI, DL,
26523 TII->get(X86::FLDCW16m)), CWFrameIdx);
    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }
26528 // String/text processing lowering.
26529 case X86::PCMPISTRM128REG:
26530 case X86::VPCMPISTRM128REG:
26531 case X86::PCMPISTRM128MEM:
26532 case X86::VPCMPISTRM128MEM:
26533 case X86::PCMPESTRM128REG:
26534 case X86::VPCMPESTRM128REG:
26535 case X86::PCMPESTRM128MEM:
26536 case X86::VPCMPESTRM128MEM:
26537 assert(Subtarget.hasSSE42() &&
26538 "Target must have SSE4.2 or AVX features enabled");
26539 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26541 // String/text processing lowering.
26542 case X86::PCMPISTRIREG:
26543 case X86::VPCMPISTRIREG:
26544 case X86::PCMPISTRIMEM:
26545 case X86::VPCMPISTRIMEM:
26546 case X86::PCMPESTRIREG:
26547 case X86::VPCMPESTRIREG:
26548 case X86::PCMPESTRIMEM:
26549 case X86::VPCMPESTRIMEM:
26550 assert(Subtarget.hasSSE42() &&
26551 "Target must have SSE4.2 or AVX features enabled");
26552 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26554 // Thread synchronization.
  case X86::MONITOR:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26557 case X86::MONITORX:
26558 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
  // Cache line zero
  case X86::CLZERO:
    return emitClzero(&MI, BB, Subtarget);
  // PKU feature
  case X86::WRPKRU:
    return emitWRPKRU(MI, BB, Subtarget);
  case X86::RDPKRU:
    return emitRDPKRU(MI, BB, Subtarget);
  case X86::XBEGIN:
    return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26573 case X86::VASTART_SAVE_XMM_REGS:
26574 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26576 case X86::VAARG_64:
26577 return EmitVAARG64WithCustomInserter(MI, BB);
26579 case X86::EH_SjLj_SetJmp32:
26580 case X86::EH_SjLj_SetJmp64:
26581 return emitEHSjLjSetJmp(MI, BB);
26583 case X86::EH_SjLj_LongJmp32:
26584 case X86::EH_SjLj_LongJmp64:
26585 return emitEHSjLjLongJmp(MI, BB);
26587 case X86::Int_eh_sjlj_setup_dispatch:
26588 return EmitSjLjDispatchBlock(MI, BB);
26590 case TargetOpcode::STATEPOINT:
26591 // As an implementation detail, STATEPOINT shares the STACKMAP format at
26592 // this point in the process. We diverge later.
26593 return emitPatchPoint(MI, BB);
26595 case TargetOpcode::STACKMAP:
26596 case TargetOpcode::PATCHPOINT:
26597 return emitPatchPoint(MI, BB);
26599 case TargetOpcode::PATCHABLE_EVENT_CALL:
    // Do nothing here, handle in xray instrumentation pass.
    return BB;
26603 case X86::LCMPXCHG8B: {
26604 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    // In addition to the four E[ABCD] registers implied by the encoding,
    // CMPXCHG8B requires a memory operand. If the current architecture is
    // i686 and the current function needs a base pointer - which is ESI on
    // i686 - the register allocator would not be able to allocate registers
    // for an address of the form X(%reg, %reg, Y): there would never be
    // enough unreserved registers during regalloc (without the base pointer
    // the only option would be X(%edi, %esi, Y)). We give the register
    // allocator a hand by precomputing the address in a new vreg using LEA.
26615 // If it is not i686 or there is no base pointer - nothing to do here.
    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
      return BB;
    // Even though this code does not necessarily need the base pointer to
    // be ESI, we check for that. The reason: if this assert fails, there
    // have been changes in the compiler's base pointer handling, which most
    // probably have to be addressed here somehow as well.
26623 assert(TRI->getBaseRegister() == X86::ESI &&
26624 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26625 "base pointer in mind");
26627 MachineRegisterInfo &MRI = MF->getRegInfo();
26628 MVT SPTy = getPointerTy(MF->getDataLayout());
26629 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26630 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26632 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26633 // Regalloc does not need any help when the memory operand of CMPXCHG8B
26634 // does not use index register.
    if (AM.IndexReg == X86::NoRegister)
      return BB;
26638 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26639 // four operand definitions that are E[ABCD] registers. We skip them and
26640 // then insert the LEA.
26641 MachineBasicBlock::iterator MBBI(MI);
26642 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26643 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
      --MBBI;
    addFullAddress(
        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

    return BB;
  }
  case X86::LCMPXCHG16B:
    return BB;
26654 case X86::LCMPXCHG8B_SAVE_EBX:
26655 case X86::LCMPXCHG16B_SAVE_RBX: {
    unsigned BasePtr =
        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26658 if (!BB->isLiveIn(BasePtr))
      BB->addLiveIn(BasePtr);
    return BB;
  }
  }
}
26665 //===----------------------------------------------------------------------===//
26666 // X86 Optimization Hooks
26667 //===----------------------------------------------------------------------===//
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
26671 const APInt &DemandedElts,
26672 const SelectionDAG &DAG,
26673 unsigned Depth) const {
26674 unsigned BitWidth = Known.getBitWidth();
26675 unsigned Opc = Op.getOpcode();
26676 EVT VT = Op.getValueType();
26677 assert((Opc >= ISD::BUILTIN_OP_END ||
26678 Opc == ISD::INTRINSIC_WO_CHAIN ||
26679 Opc == ISD::INTRINSIC_W_CHAIN ||
26680 Opc == ISD::INTRINSIC_VOID) &&
26681 "Should use MaskedValueIsZero if you don't know whether Op"
26682 " is a target node!");
26698 // These nodes' second result is a boolean.
26699 if (Op.getResNo() == 0)
26702 case X86ISD::SETCC:
    Known.Zero.setBitsFrom(1);
    break;
26705 case X86ISD::MOVMSK: {
26706 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
    Known.Zero.setBitsFrom(NumLoBits);
    break;
  }
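  // For constant-amount vector shifts the known bits of the source simply
  // move with the data, and the vacated bit positions are known zero.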
26710 case X86ISD::VSHLI:
26711 case X86ISD::VSRLI: {
26712 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26713 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
        Known.setAllZero();
        break;
      }

26718 DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
26719 unsigned ShAmt = ShiftImm->getZExtValue();
26720 if (Opc == X86ISD::VSHLI) {
26721 Known.Zero <<= ShAmt;
26722 Known.One <<= ShAmt;
26723 // Low bits are known zero.
        Known.Zero.setLowBits(ShAmt);
      } else {
26726 Known.Zero.lshrInPlace(ShAmt);
26727 Known.One.lshrInPlace(ShAmt);
26728 // High bits are known zero.
        Known.Zero.setHighBits(ShAmt);
      }
    }
    break;
  }
26734 case X86ISD::VZEXT: {
26735 SDValue N0 = Op.getOperand(0);
26736 unsigned NumElts = VT.getVectorNumElements();
26738 EVT SrcVT = N0.getValueType();
26739 unsigned InNumElts = SrcVT.getVectorNumElements();
26740 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
26741 assert(InNumElts >= NumElts && "Illegal VZEXT input");
26743 Known = KnownBits(InBitWidth);
26744 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
26745 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
26746 Known = Known.zext(BitWidth);
    Known.Zero.setBitsFrom(InBitWidth);
    break;
  }
  }
}
26753 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
26754 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
26755 unsigned Depth) const {
26756 unsigned VTBits = Op.getScalarValueSizeInBits();
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::SETCC_CARRY:
    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
    return VTBits;

  case X86ISD::VSEXT: {
26764 SDValue Src = Op.getOperand(0);
26765 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    Tmp += VTBits - Src.getScalarValueSizeInBits();
    return Tmp;
  }

26770 case X86ISD::VSRAI: {
26771 SDValue Src = Op.getOperand(0);
26772 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
    ShiftVal += Tmp;
    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
  }

26778 case X86ISD::PCMPGT:
  case X86ISD::PCMPEQ:
  case X86ISD::CMPP:
  case X86ISD::VPCOM:
  case X86ISD::VPCOMU:
    // Vector compares return zero/all-bits result values.
    return VTBits;
  }

  // Fallback case.
  return 1;
}
26791 /// Returns true (and the GlobalValue and the offset) if the node is a
26792 /// GlobalAddress + offset.
26793 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
26794 const GlobalValue* &GA,
26795 int64_t &Offset) const {
26796 if (N->getOpcode() == X86ISD::Wrapper) {
26797 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
26798 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
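// For example, the mask <0,0> maps to MOVDDUP below, and <0,-1,-1,-1> (with
// -1 denoting an undef lane) maps to VZEXT_MOVL.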
26809 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26810 bool AllowFloatDomain, bool AllowIntDomain,
26811 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
26812 const X86Subtarget &Subtarget,
26813 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
26814 unsigned NumMaskElts = Mask.size();
26815 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
26817 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
26818 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
26819 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
26820 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
26821 unsigned MaxScale = 64 / MaskEltSize;
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
      bool Match = true;
      unsigned NumDstElts = NumMaskElts / Scale;
26825 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
26826 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
      }
      if (Match) {
        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
26831 SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
26832 if (SrcVT != MaskVT)
26833 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
26834 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
26835 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
26836 Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
                                  : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
        return true;
      }
    }
  }

26843 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
26844 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
26845 isUndefOrEqual(Mask[0], 0) &&
26846 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
26847 Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }
  // Check if we have SSE3 which will let us use MOVDDUP etc. The
  // instructions are no slower than UNPCKLPD but have the option to
  // fold the input operand into even an unaligned memory load.
26855 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
26856 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
26857 Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
26861 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26862 Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
26866 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
26867 Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }

26873 if (MaskVT.is256BitVector() && AllowFloatDomain) {
26874 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
26875 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26876 Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v4f64;
      return true;
    }
26880 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26881 Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
26885 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
26886 Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
  }

26892 if (MaskVT.is512BitVector() && AllowFloatDomain) {
26893 assert(Subtarget.hasAVX512() &&
26894 "AVX512 required for 512-bit vector shuffles");
26895 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26896 Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v8f64;
      return true;
    }
26900 if (isTargetShuffleEquivalent(
26901 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
26902 Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
26906 if (isTargetShuffleEquivalent(
26907 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
26908 Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
  }

26914 // Attempt to match against broadcast-from-vector.
26915 if (Subtarget.hasAVX2()) {
26916 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
26917 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
26918 SrcVT = DstVT = MaskVT;
      Shuffle = X86ISD::VBROADCAST;
      return true;
    }
  }

  return false;
}
26927 // Attempt to match a combined shuffle mask against supported unary immediate
26928 // permute instructions.
26929 // TODO: Investigate sharing more of this with shuffle lowering.
26930 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26931 bool AllowFloatDomain,
26932 bool AllowIntDomain,
26933 const X86Subtarget &Subtarget,
26934 unsigned &Shuffle, MVT &ShuffleVT,
26935 unsigned &PermuteImm) {
26936 unsigned NumMaskElts = Mask.size();
26938 bool ContainsZeros = false;
26939 APInt Zeroable(NumMaskElts, false);
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (isUndefOrZero(M))
      Zeroable.setBit(i);
    ContainsZeros |= (M == SM_SentinelZero);
  }
26947 // Attempt to match against byte/bit shifts.
26948 // FIXME: Add 512-bit support.
26949 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26950 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26951 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
26952 MaskVT.getScalarSizeInBits(), Mask,
26953 0, Zeroable, Subtarget);
26954 if (0 < ShiftAmt) {
      PermuteImm = (unsigned)ShiftAmt;
      return true;
    }
  }
  // Ensure we don't contain any zero elements.
  if (ContainsZeros)
    return false;
26964 assert(llvm::all_of(Mask, [&](int M) {
26965 return SM_SentinelUndef <= M && M < (int)NumMaskElts;
26966 }) && "Expected unary shuffle");
26968 unsigned InputSizeInBits = MaskVT.getSizeInBits();
26969 unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
26970 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
26972 // Handle PSHUFLW/PSHUFHW repeated patterns.
26973 if (MaskScalarSizeInBits == 16) {
26974 SmallVector<int, 4> RepeatedMask;
26975 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
26976 ArrayRef<int> LoMask(Mask.data() + 0, 4);
26977 ArrayRef<int> HiMask(Mask.data() + 4, 4);
26979 // PSHUFLW: permute lower 4 elements only.
26980 if (isUndefOrInRange(LoMask, 0, 4) &&
26981 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
26982 Shuffle = X86ISD::PSHUFLW;
26983 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(LoMask);
        return true;
      }
26988 // PSHUFHW: permute upper 4 elements only.
26989 if (isUndefOrInRange(HiMask, 4, 8) &&
26990 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
26991 // Offset the HiMask so that we can create the shuffle immediate.
26992 int OffsetHiMask[4];
26993 for (int i = 0; i != 4; ++i)
26994 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
26996 Shuffle = X86ISD::PSHUFHW;
26997 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
        return true;
      }
    }

    return false;
  }

27007 // We only support permutation of 32/64 bit elements after this.
  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
    return false;
27011 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
27012 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
  if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
    return false;
27016 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
27017 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
27018 AllowFloatDomain = true;
    AllowIntDomain = false;
  }
27022 // Check for lane crossing permutes.
27023 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27024 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27025 if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
27026 Shuffle = X86ISD::VPERMI;
27027 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
      PermuteImm = getV4X86ShuffleImm(Mask);
      return true;
    }
27031 if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
27032 SmallVector<int, 4> RepeatedMask;
27033 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27034 Shuffle = X86ISD::VPERMI;
27035 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
        PermuteImm = getV4X86ShuffleImm(RepeatedMask);
        return true;
      }
    }

    return false;
  }
27043 // VPERMILPD can permute with a non-repeating shuffle.
27044 if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
27045 Shuffle = X86ISD::VPERMILPI;
    ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
    PermuteImm = 0;
    for (int i = 0, e = Mask.size(); i != e; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef)
        continue;
      assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
      PermuteImm |= (M & 1) << i;
    }
    return true;
  }
27058 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
27059 SmallVector<int, 4> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
    return false;
27063 // Narrow the repeated mask for 32-bit element permutes.
27064 SmallVector<int, 4> WordMask = RepeatedMask;
27065 if (MaskScalarSizeInBits == 64)
27066 scaleShuffleMask(2, RepeatedMask, WordMask);
27068 Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
27069 ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
27070 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
  PermuteImm = getV4X86ShuffleImm(WordMask);
  return true;
}
27075 // Attempt to match a combined unary shuffle mask against supported binary
27076 // shuffle instructions.
27077 // TODO: Investigate sharing more of this with shuffle lowering.
27078 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27079 bool AllowFloatDomain, bool AllowIntDomain,
                                     SDValue &V1, SDValue &V2, SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &ShuffleVT,
                                     bool &IsUnary) {
27085 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27087 if (MaskVT.is128BitVector()) {
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVLHPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVHLPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      ShuffleVT = MaskVT;
      return true;
    }
27107 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27108 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27109 Shuffle = X86ISD::MOVSS;
      ShuffleVT = MaskVT;
      return true;
    }
  }
27115 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27116 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27117 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27118 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27119 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27120 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
    if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
                                    DAG, Subtarget)) {
      ShuffleVT = MaskVT;
      if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
        ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
      return true;
    }
  }

  return false;
}
27133 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27134 bool AllowFloatDomain,
27135 bool AllowIntDomain,
                                            SDValue &V1, SDValue &V2, SDLoc &DL,
                                            SelectionDAG &DAG,
27138 const X86Subtarget &Subtarget,
27139 unsigned &Shuffle, MVT &ShuffleVT,
27140 unsigned &PermuteImm) {
27141 unsigned NumMaskElts = Mask.size();
27142 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27144 // Attempt to match against PALIGNR byte rotate.
27145 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27146 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27147 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27148 if (0 < ByteRotation) {
27149 Shuffle = X86ISD::PALIGNR;
27150 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }
27156 // Attempt to combine to X86ISD::BLENDI.
27157 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27158 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27159 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27160 uint64_t BlendMask = 0;
27161 bool ForceV1Zero = false, ForceV2Zero = false;
27162 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
    if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
                                  BlendMask)) {
27165 if (MaskVT == MVT::v16i16) {
27166 // We can only use v16i16 PBLENDW if the lanes are repeated.
27167 SmallVector<int, 8> RepeatedMask;
        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
                                        RepeatedMask)) {
          assert(RepeatedMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          PermuteImm = 0;
          for (int i = 0; i < 8; ++i)
27174 if (RepeatedMask[i] >= 8)
27175 PermuteImm |= 1 << i;
27176 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27177 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27178 Shuffle = X86ISD::BLENDI;
          ShuffleVT = MaskVT;
          return true;
        }
      } else {
27183 // Determine a type compatible with X86ISD::BLENDI.
27184 ShuffleVT = MaskVT;
27185 if (Subtarget.hasAVX2()) {
27186 if (ShuffleVT == MVT::v4i64)
27187 ShuffleVT = MVT::v8i32;
27188 else if (ShuffleVT == MVT::v2i64)
            ShuffleVT = MVT::v4i32;
        } else {
27191 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27192 ShuffleVT = MVT::v8i16;
27193 else if (ShuffleVT == MVT::v4i64)
27194 ShuffleVT = MVT::v4f64;
27195 else if (ShuffleVT == MVT::v8i32)
            ShuffleVT = MVT::v8f32;
        }

        if (!ShuffleVT.isFloatingPoint()) {
          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
          BlendMask =
              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
        }
27207 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27208 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27209 PermuteImm = (unsigned)BlendMask;
        Shuffle = X86ISD::BLENDI;
        return true;
      }
    }
  }
27216 // Attempt to combine to INSERTPS.
27217 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27218 MaskVT.is128BitVector()) {
27219 APInt Zeroable(4, 0);
    for (unsigned i = 0; i != NumMaskElts; ++i)
      if (isUndefOrZero(Mask[i]))
        Zeroable.setBit(i);
27224 if (Zeroable.getBoolValue() &&
27225 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27226 Shuffle = X86ISD::INSERTPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
  }
27232 // Attempt to combine to SHUFPD.
27233 if (AllowFloatDomain && EltSizeInBits == 64 &&
27234 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27235 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27236 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27237 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27238 Shuffle = X86ISD::SHUFP;
      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
      return true;
    }
  }
27244 // Attempt to combine to SHUFPS.
27245 if (AllowFloatDomain && EltSizeInBits == 32 &&
27246 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27247 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27248 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27249 SmallVector<int, 4> RepeatedMask;
27250 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask, to determine if it's just
      // referencing one of the vectors, is zeroable or entirely undef.
27253 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27254 int M0 = RepeatedMask[Offset];
27255 int M1 = RepeatedMask[Offset + 1];
27257 if (isUndefInRange(RepeatedMask, Offset, 2)) {
27258 return DAG.getUNDEF(MaskVT);
27259 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27260 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27261 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27262 return getZeroVector(MaskVT, Subtarget, DAG, DL);
27263 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27264 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V1;
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V2;
        }

        return SDValue();
      };
27276 int ShufMask[4] = {-1, -1, -1, -1};
27277 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (Lo && Hi) {
        V1 = Lo;
        V2 = Hi;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  return false;
}
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
27297 /// This is the leaf of the recursive combine below. When we have found some
27298 /// chain of single-use x86 shuffle instructions and accumulated the combined
27299 /// shuffle mask represented by them, this will try to pattern match that mask
27300 /// into either a single instruction if there is a special purpose instruction
27301 /// for this operation, or into a PSHUFB instruction which is a fully general
27302 /// instruction but should only be used to replace chains over a certain depth.
27303 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27304 ArrayRef<int> BaseMask, int Depth,
27305 bool HasVariableMask, SelectionDAG &DAG,
27306 TargetLowering::DAGCombinerInfo &DCI,
27307 const X86Subtarget &Subtarget) {
27308 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27309 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27310 "Unexpected number of shuffle inputs!");
27312 // Find the inputs that enter the chain. Note that multiple uses are OK
27313 // here, we're not going to remove the operands we find.
27314 bool UnaryShuffle = (Inputs.size() == 1);
27315 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27316 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27317 : peekThroughBitcasts(Inputs[1]));
27319 MVT VT1 = V1.getSimpleValueType();
27320 MVT VT2 = V2.getSimpleValueType();
27321 MVT RootVT = Root.getSimpleValueType();
27322 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27323 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27324 "Vector size mismatch");
27329 unsigned NumBaseMaskElts = BaseMask.size();
27330 if (NumBaseMaskElts == 1) {
27331 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
                  /*AddTo*/ true);
    return true;
  }
27337 unsigned RootSizeInBits = RootVT.getSizeInBits();
27338 unsigned NumRootElts = RootVT.getVectorNumElements();
27339 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27340 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27341 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27343 // Don't combine if we are a AVX512/EVEX target and the mask element size
27344 // is different from the root element size - this would prevent writemasks
27345 // from being reused.
27346 // TODO - this currently prevents all lane shuffles from occurring.
27347 // TODO - check for writemasks usage instead of always preventing combining.
27348 // TODO - attempt to narrow Mask back to writemask size.
27349 bool IsEVEXShuffle =
27350 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
  if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
    return false;
27354 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27356 // Handle 128-bit lane shuffles of 256-bit vectors.
27357 // TODO - this should support binary shuffles.
27358 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27359 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27360 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27361 return false; // Nothing to do!
27362 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27363 unsigned PermMask = 0;
27364 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27365 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
27367 Res = DAG.getBitcast(ShuffleVT, V1);
27368 DCI.AddToWorklist(Res.getNode());
27369 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27370 DAG.getUNDEF(ShuffleVT),
27371 DAG.getConstant(PermMask, DL, MVT::i8));
27372 DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
27378 // For masks that have been widened to 128-bit elements or more,
27379 // narrow back down to 64-bit elements.
27380 SmallVector<int, 64> Mask;
27381 if (BaseMaskEltSizeInBits > 64) {
27382 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27383 int MaskScale = BaseMaskEltSizeInBits / 64;
    scaleShuffleMask(MaskScale, BaseMask, Mask);
  } else {
    Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
  }
27389 unsigned NumMaskElts = Mask.size();
27390 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27392 // Determine the effective mask value type.
27393 FloatDomain &= (32 <= MaskEltSizeInBits);
27394 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27395 : MVT::getIntegerVT(MaskEltSizeInBits);
27396 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27398 // Only allow legal mask types.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
    return false;
27402 // Attempt to match the mask against known shuffle patterns.
27403 MVT ShuffleSrcVT, ShuffleVT;
27404 unsigned Shuffle, PermuteImm;
27406 // Which shuffle domains are permitted?
27407 // Permit domain crossing at higher combine depths.
27408 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27409 bool AllowIntDomain = !FloatDomain || (Depth > 3);
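  // (At Depth > 3 the cheaper same-domain matches have already failed, so a
  // possible domain-crossing penalty is likely worth paying.)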
27411 if (UnaryShuffle) {
27412 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
27413 // directly if we don't shuffle the lower element and we shuffle the upper
27414 // (zero) elements within themselves.
27415 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27416 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27417 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27418 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27419 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27420 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
                    /*AddTo*/ true);
      return true;
    }
  }
27427 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
                              V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
                              ShuffleVT)) {
27430 if (Depth == 1 && Root.getOpcode() == Shuffle)
27431 return false; // Nothing to do!
27432 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27433 return false; // AVX512 Writemask clash.
27434 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27435 DCI.AddToWorklist(Res.getNode());
27436 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27437 DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
27443 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27444 AllowIntDomain, Subtarget, Shuffle,
27445 ShuffleVT, PermuteImm)) {
27446 if (Depth == 1 && Root.getOpcode() == Shuffle)
27447 return false; // Nothing to do!
27448 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27449 return false; // AVX512 Writemask clash.
27450 Res = DAG.getBitcast(ShuffleVT, V1);
27451 DCI.AddToWorklist(Res.getNode());
27452 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27453 DAG.getConstant(PermuteImm, DL, MVT::i8));
27454 DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
27461 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
                               V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
                               UnaryShuffle)) {
27464 if (Depth == 1 && Root.getOpcode() == Shuffle)
27465 return false; // Nothing to do!
27466 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27467 return false; // AVX512 Writemask clash.
27468 V1 = DAG.getBitcast(ShuffleVT, V1);
27469 DCI.AddToWorklist(V1.getNode());
27470 V2 = DAG.getBitcast(ShuffleVT, V2);
27471 DCI.AddToWorklist(V2.getNode());
27472 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27473 DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
27479 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27480 AllowIntDomain, V1, V2, DL, DAG,
                                      Subtarget, Shuffle, ShuffleVT,
                                      PermuteImm)) {
27483 if (Depth == 1 && Root.getOpcode() == Shuffle)
27484 return false; // Nothing to do!
27485 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27486 return false; // AVX512 Writemask clash.
27487 V1 = DAG.getBitcast(ShuffleVT, V1);
27488 DCI.AddToWorklist(V1.getNode());
27489 V2 = DAG.getBitcast(ShuffleVT, V2);
27490 DCI.AddToWorklist(V2.getNode());
27491 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27492 DAG.getConstant(PermuteImm, DL, MVT::i8));
27493 DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
27499 // Don't try to re-form single instruction chains under any circumstances now
  // that we've done encoding canonicalization for them.
  if (Depth < 2)
    return false;
27504 bool MaskContainsZeros =
27505 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27507 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27508 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27509 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27510 ((Subtarget.hasAVX2() &&
27511 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27512 (Subtarget.hasAVX512() &&
27513 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27514 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27515 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27516 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27517 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27518 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27519 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27520 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27521 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27522 DCI.AddToWorklist(VPermMask.getNode());
27523 Res = DAG.getBitcast(MaskVT, V1);
27524 DCI.AddToWorklist(Res.getNode());
27525 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27526 DCI.AddToWorklist(Res.getNode());
27527 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
27532 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27533 // vector as the second source.
27534 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27535 ((Subtarget.hasAVX512() &&
27536 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27537 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27538 (Subtarget.hasVLX() &&
27539 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27540 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27541 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27542 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27543 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27544 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27545 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27546 for (unsigned i = 0; i != NumMaskElts; ++i)
27547 if (Mask[i] == SM_SentinelZero)
27548 Mask[i] = NumMaskElts + i;
27550 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27551 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27552 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27553 DCI.AddToWorklist(VPermMask.getNode());
27554 Res = DAG.getBitcast(MaskVT, V1);
27555 DCI.AddToWorklist(Res.getNode());
27556 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27557 DCI.AddToWorklist(Zero.getNode());
27558 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27559 DCI.AddToWorklist(Res.getNode());
27560 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
27565 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27566 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27567 ((Subtarget.hasAVX512() &&
27568 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27569 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27570 (Subtarget.hasVLX() &&
27571 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27572 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27573 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27574 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27575 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27576 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27577 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27578 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27579 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27580 DCI.AddToWorklist(VPermMask.getNode());
27581 V1 = DAG.getBitcast(MaskVT, V1);
27582 DCI.AddToWorklist(V1.getNode());
27583 V2 = DAG.getBitcast(MaskVT, V2);
27584 DCI.AddToWorklist(V2.getNode());
27585 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27586 DCI.AddToWorklist(Res.getNode());
27587 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
27594 // See if we can combine a single input shuffle with zeros to a bit-mask,
27595 // which is much simpler than any shuffle.
27596 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27597 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27598 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27599 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27600 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27601 APInt UndefElts(NumMaskElts, 0);
27602 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27603 for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
27605 if (M == SM_SentinelUndef) {
27606 UndefElts.setBit(i);
continue;
}
27609 if (M == SM_SentinelZero)
continue;
27611 EltBits[i] = AllOnes;
27613 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27614 DCI.AddToWorklist(BitMask.getNode());
27615 Res = DAG.getBitcast(MaskVT, V1);
27616 DCI.AddToWorklist(Res.getNode());
27617 unsigned AndOpcode =
27618 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27619 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27620 DCI.AddToWorklist(Res.getNode());
27621 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
27626 // If we have a single input shuffle with different shuffle patterns in the
27627 // 128-bit lanes, use the variable mask to VPERMILPS.
27628 // TODO: Combine other mask types at higher depths.
27629 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27630 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27631 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27632 SmallVector<SDValue, 16> VPermIdx;
27633 for (int M : Mask) {
SDValue Idx =
27635 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27636 VPermIdx.push_back(Idx);
27638 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
27639 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
27640 DCI.AddToWorklist(VPermMask.getNode());
27641 Res = DAG.getBitcast(MaskVT, V1);
27642 DCI.AddToWorklist(Res.getNode());
27643 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27644 DCI.AddToWorklist(Res.getNode());
27645 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
27650 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27651 // to VPERMIL2PD/VPERMIL2PS.
27652 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27653 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27654 MaskVT == MVT::v8f32)) {
27655 // VPERMIL2 Operation.
27656 // Bits[3] - Match Bit.
27657 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27658 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
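// Illustrative sketch of the encoding below: for v4f32 (PS), a mask element
// M == 6 (element 2 of the second source) becomes Index = (6 % 4) +
// (6 / 4) * 4 = 6 (0b110), where bit[2] selects the source and bits[1:0]
// select the element; a zero sentinel pushes index 8, setting the match bit
// so the lane is zeroed according to M2ZImm.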
27659 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27660 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27661 SmallVector<int, 8> VPerm2Idx;
27662 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
27663 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
27664 unsigned M2ZImm = 0;
27665 for (int M : Mask) {
27666 if (M == SM_SentinelUndef) {
27667 VPerm2Idx.push_back(-1);
27670 if (M == SM_SentinelZero) {
27672 VPerm2Idx.push_back(8);
27675 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27676 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27677 VPerm2Idx.push_back(Index);
27679 V1 = DAG.getBitcast(MaskVT, V1);
27680 DCI.AddToWorklist(V1.getNode());
27681 V2 = DAG.getBitcast(MaskVT, V2);
27682 DCI.AddToWorklist(V2.getNode());
27683 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
27684 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27685 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27686 DAG.getConstant(M2ZImm, DL, MVT::i8));
27687 DCI.AddToWorklist(Res.getNode());
27688 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
27693 // If we have 3 or more shuffle instructions or a chain involving a variable
27694 // mask, we can replace them with a single PSHUFB instruction profitably.
27695 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27696 // instructions, but in practice PSHUFB tends to be *very* fast so we're
27697 // more aggressive.
27698 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27699 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27700 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27701 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
27702 SmallVector<SDValue, 16> PSHUFBMask;
27703 int NumBytes = RootVT.getSizeInBits() / 8;
27704 int Ratio = NumBytes / NumMaskElts;
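// Sketch of the expansion below: for a v4i32 mask on a 128-bit vector,
// NumBytes = 16 and Ratio = 4, so dword mask element M covers byte indices
// 4*M+0 .. 4*M+3; undef lanes stay undef and zero lanes use 255 (PSHUFB
// zeroes any byte whose mask value has the top bit set).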
27705 for (int i = 0; i < NumBytes; ++i) {
27706 int M = Mask[i / Ratio];
27707 if (M == SM_SentinelUndef) {
27708 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
27711 if (M == SM_SentinelZero) {
27712 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
27715 M = Ratio * M + i % Ratio;
27716 assert((M / 16) == (i / 16) && "Lane crossing detected");
27717 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27719 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
27720 Res = DAG.getBitcast(ByteVT, V1);
27721 DCI.AddToWorklist(Res.getNode());
27722 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
27723 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
27724 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
27725 DCI.AddToWorklist(Res.getNode());
27726 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
27731 // With XOP, if we have a 128-bit binary input shuffle we can always combine
27732 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
27733 // slower than PSHUFB on targets that support both.
27734 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
27735 Subtarget.hasXOP()) {
27736 // VPPERM Mask Operation
27737 // Bits[4:0] - Byte Index (0 - 31)
27738 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
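// Illustrative example: mask byte 20 selects byte 4 of V2 (indices 16-31 map
// to the second source), while 128 (0x80) sets the permute-op field to ZERO
// so the result byte is cleared.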
27739 SmallVector<SDValue, 16> VPPERMMask;
int NumBytes = RootVT.getSizeInBits() / 8;
27741 int Ratio = NumBytes / NumMaskElts;
27742 for (int i = 0; i < NumBytes; ++i) {
27743 int M = Mask[i / Ratio];
27744 if (M == SM_SentinelUndef) {
27745 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
27748 if (M == SM_SentinelZero) {
27749 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
27752 M = Ratio * M + i % Ratio;
27753 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27755 MVT ByteVT = MVT::v16i8;
27756 V1 = DAG.getBitcast(ByteVT, V1);
27757 DCI.AddToWorklist(V1.getNode());
27758 V2 = DAG.getBitcast(ByteVT, V2);
27759 DCI.AddToWorklist(V2.getNode());
27760 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
27761 DCI.AddToWorklist(VPPERMMaskOp.getNode());
27762 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
27763 DCI.AddToWorklist(Res.getNode());
27764 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
27769 // Failed to find any combines.
return false;
}
27773 // Attempt to constant fold all of the constant source ops.
27774 // Returns true if the entire shuffle is folded to a constant.
27775 // TODO: Extend this to merge multiple constant Ops and update the mask.
27776 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
27777 ArrayRef<int> Mask, SDValue Root,
27778 bool HasVariableMask, SelectionDAG &DAG,
27779 TargetLowering::DAGCombinerInfo &DCI,
27780 const X86Subtarget &Subtarget) {
27781 MVT VT = Root.getSimpleValueType();
27783 unsigned SizeInBits = VT.getSizeInBits();
27784 unsigned NumMaskElts = Mask.size();
27785 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
27786 unsigned NumOps = Ops.size();
27788 // Extract constant bits from each source op.
27789 bool OneUseConstantOp = false;
27790 SmallVector<APInt, 16> UndefEltsOps(NumOps);
27791 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
27792 for (unsigned i = 0; i != NumOps; ++i) {
27793 SDValue SrcOp = Ops[i];
27794 OneUseConstantOp |= SrcOp.hasOneUse();
27795 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
RawBitsOps[i]))
return false;
}
27800 // Only fold if at least one of the constants is used only once or
27801 // the combined shuffle has included a variable mask shuffle; this
27802 // is to avoid constant pool bloat.
27803 if (!OneUseConstantOp && !HasVariableMask)
27806 // Shuffle the constant bits according to the mask.
27807 APInt UndefElts(NumMaskElts, 0);
27808 APInt ZeroElts(NumMaskElts, 0);
27809 APInt ConstantElts(NumMaskElts, 0);
27810 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
27811 APInt::getNullValue(MaskSizeInBits));
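// Illustrative example: shuffling a constant <4 x i32> <C0,C1,C2,C3> with
// mask <2,-1,0,3> yields the constant vector <C2,undef,C0,C3>, so the
// shuffle node disappears entirely.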
27812 for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
27814 if (M == SM_SentinelUndef) {
27815 UndefElts.setBit(i);
continue;
27817 } else if (M == SM_SentinelZero) {
27818 ZeroElts.setBit(i);
continue;
}
27821 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
27823 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
27824 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
27826 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
27827 if (SrcUndefElts[SrcMaskIdx]) {
27828 UndefElts.setBit(i);
continue;
}
27832 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
27833 APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
27835 ZeroElts.setBit(i);
continue;
}
27839 ConstantElts.setBit(i);
27840 ConstantBitData[i] = Bits;
27842 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
27844 // Create the constant data.
MVT MaskSVT;
27846 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27847 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
else
27849 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27851 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
SDLoc DL(Root);
27854 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27855 DCI.AddToWorklist(CstOp.getNode());
27856 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
return true;
}
27860 /// \brief Fully generic combining of x86 shuffle instructions.
27862 /// This should be the last combine run over the x86 shuffle instructions. Once
27863 /// they have been fully optimized, this will recursively consider all chains
27864 /// of single-use shuffle instructions, build a generic model of the cumulative
27865 /// shuffle operation, and check for simpler instructions which implement this
27866 /// operation. We use this primarily for two purposes:
27868 /// 1) Collapse generic shuffles to specialized single instructions when
27869 /// equivalent. In most cases, this is just an encoding size win, but
27870 /// sometimes we will collapse multiple generic shuffles into a single
27871 /// special-purpose shuffle.
27872 /// 2) Look for sequences of shuffle instructions with 3 or more total
27873 /// instructions, and replace them with the slightly more expensive SSSE3
27874 /// PSHUFB instruction if available. We do this as the last combining step
27875 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27876 /// a suitable short sequence of other instructions. The PSHUFB will either
27877 /// use a register or have to read from memory and so is slightly (but only
27878 /// slightly) more expensive than the other shuffle instructions.
27880 /// Because this is inherently a quadratic operation (for each shuffle in
27881 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27882 /// This should never be an issue in practice as the shuffle lowering doesn't
27883 /// produce sequences of more than 8 instructions.
27885 /// FIXME: We will currently miss some cases where the redundant shuffling
27886 /// would simplify under the threshold for PSHUFB formation because of
27887 /// combine-ordering. To fix this, we should do the redundant instruction
27888 /// combining in this recursive walk.
27889 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27890 int SrcOpIndex, SDValue Root,
27891 ArrayRef<int> RootMask,
27892 ArrayRef<const SDNode*> SrcNodes,
27893 int Depth, bool HasVariableMask,
SelectionDAG &DAG,
27895 TargetLowering::DAGCombinerInfo &DCI,
27896 const X86Subtarget &Subtarget) {
27897 // Bound the depth of our recursive combine because this is ultimately
27898 // quadratic in nature.
if (Depth > 8)
return false;
27902 // Directly rip through bitcasts to find the underlying operand.
27903 SDValue Op = SrcOps[SrcOpIndex];
27904 Op = peekThroughOneUseBitcasts(Op);
27906 MVT VT = Op.getSimpleValueType();
27907 if (!VT.isVector())
27908 return false; // Bail if we hit a non-vector.
27910 assert(Root.getSimpleValueType().isVector() &&
27911 "Shuffles operate on vector types!");
27912 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27913 "Can only combine shuffles of the same vector register size.");
27915 // Extract target shuffle mask and resolve sentinels and inputs.
27916 SmallVector<int, 64> OpMask;
27917 SmallVector<SDValue, 2> OpInputs;
27918 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
return false;
27921 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
27922 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
27923 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
27925 // Add the inputs to the Ops list, avoiding duplicates.
27926 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
27928 int InputIdx0 = -1, InputIdx1 = -1;
27929 for (int i = 0, e = Ops.size(); i < e; ++i) {
27930 SDValue BC = peekThroughBitcasts(Ops[i]);
27931 if (Input0 && BC == peekThroughBitcasts(Input0))
InputIdx0 = i;
27933 if (Input1 && BC == peekThroughBitcasts(Input1))
InputIdx1 = i;
}
27937 if (Input0 && InputIdx0 < 0) {
27938 InputIdx0 = SrcOpIndex;
27939 Ops[SrcOpIndex] = Input0;
27941 if (Input1 && InputIdx1 < 0) {
27942 InputIdx1 = Ops.size();
27943 Ops.push_back(Input1);
27946 assert(((RootMask.size() > OpMask.size() &&
27947 RootMask.size() % OpMask.size() == 0) ||
27948 (OpMask.size() > RootMask.size() &&
27949 OpMask.size() % RootMask.size() == 0) ||
27950 OpMask.size() == RootMask.size()) &&
27951 "The smaller number of elements must divide the larger.");
27952 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27953 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27954 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27955 assert(((RootRatio == 1 && OpRatio == 1) ||
27956 (RootRatio == 1) != (OpRatio == 1)) &&
27957 "Must not have a ratio for both incoming and op masks!");
27959 SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
27961 // Merge this shuffle operation's mask into our accumulated mask. Note that
27962 // this shuffle's mask will be the first applied to the input, followed by the
27963 // root mask to get us all the way to the root value arrangement. The reason
27964 // for this order is that we are recursing up the operation chain.
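// Illustrative example: with a 4-element RootMask over an 8-element OpMask,
// MaskWidth = 8, RootRatio = 2 and OpRatio = 1, so root element j expands to
// the scaled indices 2*j and 2*j+1 before being mapped through OpMask.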
27965 for (int i = 0; i < MaskWidth; ++i) {
27966 int RootIdx = i / RootRatio;
27967 if (RootMask[RootIdx] < 0) {
27968 // This is a zero or undef lane, we're done.
27969 Mask[i] = RootMask[RootIdx];
27973 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27975 // Just insert the scaled root mask value if it references an input other
27976 // than the SrcOp we're currently inserting.
27977 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27978 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27979 Mask[i] = RootMaskedIdx;
27983 RootMaskedIdx %= MaskWidth;
27985 int OpIdx = RootMaskedIdx / OpRatio;
27986 if (OpMask[OpIdx] < 0) {
27987 // The incoming lanes are zero or undef; it doesn't matter which ones we
// are using.
27989 Mask[i] = OpMask[OpIdx];
27993 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
27994 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27995 OpMaskedIdx %= MaskWidth;
27997 if (OpMask[OpIdx] < (int)OpMask.size()) {
27998 assert(0 <= InputIdx0 && "Unknown target shuffle input");
27999 OpMaskedIdx += InputIdx0 * MaskWidth;
} else {
28001 assert(0 <= InputIdx1 && "Unknown target shuffle input");
28002 OpMaskedIdx += InputIdx1 * MaskWidth;
28005 Mask[i] = OpMaskedIdx;
28008 // Handle the all undef/zero cases early.
28009 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
28010 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
28013 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
28014 // TODO - should we handle the mixed zero/undef case as well? Just returning
28015 // a zero mask will lose information on undef elements, possibly reducing
28016 // future combine possibilities.
28017 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
28018 Subtarget, DAG, SDLoc(Root)));
28022 // Remove unused shuffle source ops.
28023 resolveTargetShuffleInputsAndMask(Ops, Mask);
28024 assert(!Ops.empty() && "Shuffle with no inputs detected");
28026 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
28028 // Update the list of shuffle nodes that have been combined so far.
28029 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
SrcNodes.end());
28031 CombinedNodes.push_back(Op.getNode());
28033 // See if we can recurse into each shuffle source op (if it's a target
28034 // shuffle). The source op should only be combined if it either has a
28035 // single use (i.e. current Op) or all its users have already been combined.
28036 for (int i = 0, e = Ops.size(); i < e; ++i)
28037 if (Ops[i].getNode()->hasOneUse() ||
28038 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
28039 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
28040 Depth + 1, HasVariableMask, DAG, DCI,
Subtarget))
return true;
28044 // Attempt to constant fold all of the constant source ops.
28045 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
Subtarget))
return true;
28049 // We can only combine unary and binary shuffle mask cases.
28050 if (Ops.size() > 2)
return false;
28053 // Minor canonicalization of the accumulated shuffle mask to make it easier
28054 // to match below. All this does is detect masks with sequential pairs of
28055 // elements, and shrink them to the half-width mask. It does this in a loop
28056 // so it will reduce the size of the mask to the minimal width mask which
28057 // performs an equivalent shuffle.
28058 SmallVector<int, 64> WidenedMask;
28059 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
28060 Mask = std::move(WidenedMask);
28063 // Canonicalization of binary shuffle masks to improve pattern matching by
28064 // commuting the inputs.
28065 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
28066 ShuffleVectorSDNode::commuteMask(Mask);
28067 std::swap(Ops[0], Ops[1]);
28070 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
DCI, Subtarget);
}
28074 /// \brief Get the PSHUF-style mask from PSHUF node.
28076 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
28077 /// PSHUF-style masks that can be reused with such instructions.
28078 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28079 MVT VT = N.getSimpleValueType();
28080 SmallVector<int, 4> Mask;
28081 SmallVector<SDValue, 2> Ops;
bool IsUnary;
28084 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28088 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
28089 // matter. Check that the upper masks are repeats and remove them.
28090 if (VT.getSizeInBits() > 128) {
28091 int LaneElts = 128 / VT.getScalarSizeInBits();
28093 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28094 for (int j = 0; j < LaneElts; ++j)
28095 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28096 "Mask doesn't repeat in high 128-bit lanes!");
28098 Mask.resize(LaneElts);
28101 switch (N.getOpcode()) {
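// A sketch of the normalization below: PSHUFD masks are returned as-is,
// PSHUFLW keeps elements [0,3], and PSHUFHW keeps elements [4,7] rebased by
// -4, so e.g. a PSHUFHW mask <0,1,2,3,7,6,5,4> comes back as <3,2,1,0>.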
28102 case X86ISD::PSHUFD:
return Mask;
28104 case X86ISD::PSHUFLW:
Mask.resize(4);
return Mask;
28107 case X86ISD::PSHUFHW:
28108 Mask.erase(Mask.begin(), Mask.begin() + 4);
28109 for (int &M : Mask)
M -= 4;
return Mask;
}
28113 llvm_unreachable("No valid shuffle instruction found!");
28117 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28119 /// We walk up the chain and look for a combinable shuffle, skipping over
28120 /// shuffles that we could hoist this shuffle's transformation past without
28121 /// altering anything.
static SDValue
28123 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28124 SelectionDAG &DAG) {
28125 assert(N.getOpcode() == X86ISD::PSHUFD &&
28126 "Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);

28129 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28130 // of the shuffles in the chain so that we can form a fresh chain to replace
28132 SmallVector<SDValue, 8> Chain;
28133 SDValue V = N.getOperand(0);
28134 for (; V.hasOneUse(); V = V.getOperand(0)) {
28135 switch (V.getOpcode()) {
default:
28137 return SDValue(); // Nothing combined!

case ISD::BITCAST:
28140 // Skip bitcasts as we always know the type for the target specific
// instructions.
continue;
28144 case X86ISD::PSHUFD:
28145 // Found another dword shuffle.
break;
28148 case X86ISD::PSHUFLW:
28149 // Check that the low words (being shuffled) are the identity in the
28150 // dword shuffle, and the high words are self-contained.
28151 if (Mask[0] != 0 || Mask[1] != 1 ||
28152 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
return SDValue();
28155 Chain.push_back(V);
continue;
28158 case X86ISD::PSHUFHW:
28159 // Check that the high words (being shuffled) are the identity in the
28160 // dword shuffle, and the low words are self-contained.
28161 if (Mask[2] != 2 || Mask[3] != 3 ||
28162 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
return SDValue();
28165 Chain.push_back(V);
continue;
28168 case X86ISD::UNPCKL:
28169 case X86ISD::UNPCKH:
28170 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28171 // shuffle into a preceding word shuffle.
28172 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28173 V.getSimpleValueType().getVectorElementType() != MVT::i16)
return SDValue();
28176 // Search for a half-shuffle which we can combine with.
28177 unsigned CombineOp =
28178 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
28179 if (V.getOperand(0) != V.getOperand(1) ||
28180 !V->isOnlyUserOf(V.getOperand(0).getNode()))
return SDValue();
28182 Chain.push_back(V);
28183 V = V.getOperand(0);
do {
28185 switch (V.getOpcode()) {
default:
28187 return SDValue(); // Nothing to combine.
28189 case X86ISD::PSHUFLW:
28190 case X86ISD::PSHUFHW:
28191 if (V.getOpcode() == CombineOp)
break;
28194 Chain.push_back(V);

LLVM_FALLTHROUGH;
case ISD::BITCAST:
28198 V = V.getOperand(0);
continue;
}
break;
28202 } while (V.hasOneUse());
28205 // Break out of the loop if we break out of the switch.
28209 if (!V.hasOneUse())
28210 // We fell out of the loop without finding a viable combining instruction.
return SDValue();
28213 // Merge this node's mask and our incoming mask.
28214 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28215 for (int &M : Mask)
M = VMask[M];
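// Illustrative example: composing an incoming mask <1,0,3,2> with a VMask of
// <2,3,0,1> yields <3,2,1,0>, the single shuffle equivalent to applying both.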
28217 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28218 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28220 // Rebuild the chain around this new shuffle.
28221 while (!Chain.empty()) {
28222 SDValue W = Chain.pop_back_val();
28224 if (V.getValueType() != W.getOperand(0).getValueType())
28225 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28227 switch (W.getOpcode()) {
28229 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28231 case X86ISD::UNPCKL:
28232 case X86ISD::UNPCKH:
28233 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
break;
28236 case X86ISD::PSHUFD:
28237 case X86ISD::PSHUFLW:
28238 case X86ISD::PSHUFHW:
28239 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28243 if (V.getValueType() != N.getValueType())
28244 V = DAG.getBitcast(N.getValueType(), V);
28246 // Return the new chain to replace N.
return V;
}
28250 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
28253 /// We walk up the chain, skipping shuffles of the other half and looking
28254 /// through shuffles which switch halves trying to find a shuffle of the same
28255 /// pair of dwords.
28256 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
SelectionDAG &DAG,
28258 TargetLowering::DAGCombinerInfo &DCI) {
assert(
28260 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28261 "Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
28263 unsigned CombineOpcode = N.getOpcode();
28265 // Walk up a single-use chain looking for a combinable shuffle.
28266 SDValue V = N.getOperand(0);
28267 for (; V.hasOneUse(); V = V.getOperand(0)) {
28268 switch (V.getOpcode()) {
default:
28270 return false; // Nothing combined!

case ISD::BITCAST:
28273 // Skip bitcasts as we always know the type for the target specific
// instructions.
continue;
28277 case X86ISD::PSHUFLW:
28278 case X86ISD::PSHUFHW:
28279 if (V.getOpcode() == CombineOpcode)
break;
28282 // Other-half shuffles are no-ops.
continue;
}
28285 // Break out of the loop if we break out of the switch.
28289 if (!V.hasOneUse())
28290 // We fell out of the loop without finding a viable combining instruction.
return false;
28293 // Combine away the bottom node as its shuffle will be accumulated into
28294 // a preceding shuffle.
28295 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28297 // Record the old value.
SDValue Old = V;
28300 // Merge this node's mask and our incoming mask (adjusted to account for all
28301 // the pshufd instructions encountered).
28302 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28303 for (int &M : Mask)
M = VMask[M];
28305 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28306 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28308 // Check that the shuffles didn't cancel each other out. If not, we need to
28309 // combine to the new one.
if (Old != V)
28311 // Replace the combinable shuffle with the combined one, updating all users
28312 // so that we re-evaluate the chain here.
28313 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

return true;
}
28318 /// \brief Try to combine x86 target specific shuffles.
28319 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28320 TargetLowering::DAGCombinerInfo &DCI,
28321 const X86Subtarget &Subtarget) {
SDLoc DL(N);
28323 MVT VT = N.getSimpleValueType();
28324 SmallVector<int, 4> Mask;
28326 unsigned Opcode = N.getOpcode();
switch (Opcode) {
28328 case X86ISD::PSHUFD:
28329 case X86ISD::PSHUFLW:
28330 case X86ISD::PSHUFHW:
28331 Mask = getPSHUFShuffleMask(N);
28332 assert(Mask.size() == 4);
break;
28334 case X86ISD::UNPCKL: {
28335 auto Op0 = N.getOperand(0);
28336 auto Op1 = N.getOperand(1);
28337 unsigned Opcode0 = Op0.getOpcode();
28338 unsigned Opcode1 = Op1.getOpcode();
28340 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28341 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28342 // TODO: Add other horizontal operations as required.
28343 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28344 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28346 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28347 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28348 // moves upper half elements into the lower half part. For example:
28350 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
// undef:v16i8
28352 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28354 // will be combined to:
28356 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28358 // This is done only for 128-bit vectors. From SSE4.1 onward this combine may
28359 // not arise because more advanced instructions are used instead.
28360 if (!VT.is128BitVector())
return SDValue();
28363 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28364 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28366 unsigned NumElts = VT.getVectorNumElements();
28367 SmallVector<int, 8> ExpectedMask(NumElts, -1);
28368 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
NumElts / 2);
28371 auto ShufOp = Op1.getOperand(0);
28372 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28373 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
}
return SDValue();
}
28377 case X86ISD::BLENDI: {
28378 SDValue V0 = N->getOperand(0);
28379 SDValue V1 = N->getOperand(1);
28380 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28381 "Unexpected input vector types");
28383 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28384 // operands and changing the mask to 1. This saves us a bunch of
28385 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28386 // x86InstrInfo knows how to commute this back after instruction selection
28387 // if it would help register allocation.
28389 // TODO: If optimizing for size or a processor that doesn't suffer from
28390 // partial register update stalls, this should be transformed into a MOVSD
28391 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
28393 if (VT == MVT::v2f64)
28394 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28395 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28396 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28397 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
}

return SDValue();
}
28402 case X86ISD::MOVSD:
28403 case X86ISD::MOVSS: {
28404 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28405 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28406 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28407 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28408 if (isZero0 && isZero1)
return SDValue();
28411 // We often lower to MOVSD/MOVSS from integer as well as native float
28412 // types; remove unnecessary domain-crossing bitcasts if we can to make it
28413 // easier to combine shuffles later on. We've already accounted for the
28414 // domain switching cost when we decided to lower with it.
28415 bool isFloat = VT.isFloatingPoint();
28416 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28417 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28418 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28419 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28420 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28421 V0 = DAG.getBitcast(NewVT, V0);
28422 V1 = DAG.getBitcast(NewVT, V1);
28423 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
}

return SDValue();
}
28428 case X86ISD::INSERTPS: {
28429 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28430 SDValue Op0 = N.getOperand(0);
28431 SDValue Op1 = N.getOperand(1);
28432 SDValue Op2 = N.getOperand(2);
28433 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28434 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28435 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28436 unsigned ZeroMask = InsertPSMask & 0xF;
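// Illustrative example: an InsertPSMask of 0x9C (0b10011100) reads source
// element 2, writes destination element 1, and zeroes destination elements
// 2 and 3.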
28438 // If we zero out all elements from Op0 then we don't need to reference it.
28439 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28440 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28441 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28443 // If we zero out the element from Op1 then we don't need to reference it.
28444 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28445 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28446 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28448 // Attempt to merge insertps Op1 with an inner target shuffle node.
28449 SmallVector<int, 8> TargetMask1;
28450 SmallVector<SDValue, 2> Ops1;
28451 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28452 int M = TargetMask1[SrcIdx];
28453 if (isUndefOrZero(M)) {
28454 // Zero/UNDEF insertion - zero out element and remove dependency.
28455 InsertPSMask |= (1u << DstIdx);
28456 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28457 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28459 // Update insertps mask srcidx and reference the source input directly.
28460 assert(0 <= M && M < 8 && "Shuffle index out of range");
28461 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28462 Op1 = Ops1[M < 4 ? 0 : 1];
28463 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28464 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28467 // Attempt to merge insertps Op0 with an inner target shuffle node.
28468 SmallVector<int, 8> TargetMask0;
28469 SmallVector<SDValue, 2> Ops0;
28470 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
return SDValue();
28473 bool Updated = false;
28474 bool UseInput00 = false;
28475 bool UseInput01 = false;
28476 for (int i = 0; i != 4; ++i) {
28477 int M = TargetMask0[i];
28478 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28479 // No change if element is already zero or the inserted element.
continue;
28481 } else if (isUndefOrZero(M)) {
28482 // If the target mask is undef/zero then we must zero the element.
28483 InsertPSMask |= (1u << i);
Updated = true;
continue;
}
28488 // The input vector element must be inline.
28489 if (M != i && M != (i + 4))
return SDValue();
28492 // Determine which inputs of the target shuffle we're using.
28493 UseInput00 |= (0 <= M && M < 4);
28494 UseInput01 |= (4 <= M);
28497 // If we're not using both inputs of the target shuffle then use the
28498 // referenced input directly.
28499 if (UseInput00 && !UseInput01) {
Updated = true;
Op0 = Ops0[0];
28502 } else if (!UseInput00 && UseInput01) {
Updated = true;
Op0 = Ops0[1];
}

if (Updated)
28508 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28509 DAG.getConstant(InsertPSMask, DL, MVT::i8));

return SDValue();
}
default:
return SDValue();
}
28517 // Nuke no-op shuffles that show up after combining.
28518 if (isNoopShuffleMask(Mask))
28519 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28521 // Look for simplifications involving one or two shuffle instructions.
28522 SDValue V = N.getOperand(0);
28523 switch (N.getOpcode()) {
28526 case X86ISD::PSHUFLW:
28527 case X86ISD::PSHUFHW:
28528 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28530 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28531 return SDValue(); // We combined away this shuffle, so we're done.
28533 // See if this reduces to a PSHUFD which is no more expensive and can
28534 // combine with more operations. Note that it has to at least flip the
28535 // dwords as otherwise it would have been removed as a no-op.
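// Illustrative example: a PSHUFLW mask of <2,3,0,1> swaps the two low
// dwords, so it can be rewritten as a PSHUFD with mask <1,0,2,3> on the
// bitcast v4i32 value.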
28536 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28537 int DMask[] = {0, 1, 2, 3};
28538 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28539 DMask[DOffset + 0] = DOffset + 1;
28540 DMask[DOffset + 1] = DOffset + 0;
28541 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28542 V = DAG.getBitcast(DVT, V);
28543 DCI.AddToWorklist(V.getNode());
28544 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28545 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28546 DCI.AddToWorklist(V.getNode());
28547 return DAG.getBitcast(VT, V);
28550 // Look for shuffle patterns which can be implemented as a single unpack.
28551 // FIXME: This doesn't handle the location of the PSHUFD generically, and
28552 // only works when we have a PSHUFD followed by two half-shuffles.
28553 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28554 (V.getOpcode() == X86ISD::PSHUFLW ||
28555 V.getOpcode() == X86ISD::PSHUFHW) &&
28556 V.getOpcode() != N.getOpcode() &&
V.hasOneUse()) {
28558 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28559 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28560 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28561 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28562 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28563 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int WordMask[8];
28565 for (int i = 0; i < 4; ++i) {
28566 WordMask[i + NOffset] = Mask[i] + NOffset;
28567 WordMask[i + VOffset] = VMask[i] + VOffset;
28569 // Map the word mask through the DWord mask.
int MappedMask[8];
28571 for (int i = 0; i < 8; ++i)
28572 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28573 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28574 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28575 // We can replace all three shuffles with an unpack.
28576 V = DAG.getBitcast(VT, D.getOperand(0));
28577 DCI.AddToWorklist(V.getNode());
28578 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
DL, VT, V, V);
}
}
}

break;
28587 case X86ISD::PSHUFD:
28588 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;

break;
}

return SDValue();
}
28597 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
28598 /// operation. If true is returned then the operands of ADDSUB operation
28599 /// are written to the parameters \p Opnd0 and \p Opnd1.
28601 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
28602 /// so it is easier to generically match. We also insert dummy vector shuffle
28603 /// nodes for the operands which explicitly discard the lanes which are unused
28604 /// by this operation, to try to propagate the fact that they're unused
28605 /// through the rest of the combiner.
28606 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28607 SDValue &Opnd0, SDValue &Opnd1) {
28609 EVT VT = N->getValueType(0);
28610 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28611 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28612 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
return false;
28615 // We only handle target-independent shuffles.
28616 // FIXME: It would be easy and harmless to use the target shuffle mask
28617 // extraction tool to support more.
28618 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
28621 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28622 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28624 SDValue V1 = N->getOperand(0);
28625 SDValue V2 = N->getOperand(1);
28627 // We require the first shuffle operand to be the FSUB node, and the second to
28628 // be the FADD node.
28629 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28630 ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
28632 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
return false;
28635 // If there are other uses of these operations we can't fold them.
28636 if (!V1->hasOneUse() || !V2->hasOneUse())
return false;
28639 // Ensure that both operations have the same operands. Note that we can
28640 // commute the FADD operands.
28641 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28642 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28643 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
return false;
28646 // We're looking for blends between FADD and FSUB nodes. We insist on these
28647 // nodes being lined up in a specific expected pattern.
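// Illustrative example: for v4f32, the blend mask <0,5,2,7> takes the even
// lanes from the FSUB node and the odd lanes from the FADD node, which is
// exactly the ADDSUBPS semantics (subtract in even lanes, add in odd lanes).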
28648 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28649 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28650 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28651 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28652 8, 25, 10, 27, 12, 29, 14, 31})))
return false;

Opnd0 = V1.getOperand(0);
Opnd1 = V1.getOperand(1);
return true;
}
28660 /// \brief Try to combine a shuffle into a target-specific add-sub or
28661 /// mul-add-sub node.
28662 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28663 const X86Subtarget &Subtarget,
28664 SelectionDAG &DAG) {
28665 SDValue Opnd0, Opnd1;
28666 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
return SDValue();
28669 EVT VT = N->getValueType(0);
SDLoc DL(N);
28672 // Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
28674 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28675 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28677 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28678 // the ADDSUB idiom has been successfully recognized. There are no known
28679 // X86 targets with 512-bit ADDSUB instructions!
28680 if (VT.is512BitVector())
return SDValue();
28683 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
28686 // We are looking for a shuffle where both sources are concatenated with undef
28687 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
28688 // if we can express this as a single-source shuffle, that's preferable.
28689 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
28690 const X86Subtarget &Subtarget) {
28691 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
return SDValue();
28694 EVT VT = N->getValueType(0);
28696 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
28697 if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
28700 if (VT.getVectorElementType() != MVT::i32 &&
28701 VT.getVectorElementType() != MVT::i64 &&
28702 VT.getVectorElementType() != MVT::f32 &&
28703 VT.getVectorElementType() != MVT::f64)
return SDValue();
28706 SDValue N0 = N->getOperand(0);
28707 SDValue N1 = N->getOperand(1);
28709 // Check that both sources are concats with undef.
28710 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
28711 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
28712 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
28713 !N1.getOperand(1).isUndef())
return SDValue();
28716 // Construct the new shuffle mask. Elements from the first source retain their
28717 // index, but elements from the second source no longer need to skip an undef.
28718 SmallVector<int, 8> Mask;
28719 int NumElts = VT.getVectorNumElements();
28721 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28722 for (int Elt : SVOp->getMask())
28723 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
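// Illustrative example: for a v4i32 result, mask <0,4,1,5> over
// (concat t1, undef) and (concat t2, undef) becomes mask <0,2,1,3> over the
// single source concat(t1, t2), which VPERMD can lower directly.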
SDLoc DL(N);
28726 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
N1.getOperand(0));
28728 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
28731 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
28732 TargetLowering::DAGCombinerInfo &DCI,
28733 const X86Subtarget &Subtarget) {
SDLoc dl(N);
28735 EVT VT = N->getValueType(0);
28736 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28737 // If we have legalized the vector types, look for blends of FADD and FSUB
28738 // nodes that we can fuse into an ADDSUB node.
28739 if (TLI.isTypeLegal(VT))
28740 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
28743 // During Type Legalization, when promoting illegal vector types,
28744 // the backend might introduce new shuffle dag nodes and bitcasts.
28746 // This code performs the following transformation:
28747 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
28748 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
28750 // We do this only if both the bitcast and the BINOP dag nodes have
28751 // one use. Also, perform this transformation only if the new binary
28752 // operation is legal. This is to avoid introducing dag nodes that
28753 // potentially need to be further expanded (or custom lowered) into a
28754 // less optimal sequence of dag nodes.
28755 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
28756 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
28757 N->getOperand(0).getOpcode() == ISD::BITCAST &&
28758 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
28759 SDValue N0 = N->getOperand(0);
28760 SDValue N1 = N->getOperand(1);
28762 SDValue BC0 = N0.getOperand(0);
28763 EVT SVT = BC0.getValueType();
28764 unsigned Opcode = BC0.getOpcode();
28765 unsigned NumElts = VT.getVectorNumElements();
28767 if (BC0.hasOneUse() && SVT.isVector() &&
28768 SVT.getVectorNumElements() * 2 == NumElts &&
28769 TLI.isOperationLegal(Opcode, VT)) {
28770 bool CanFold = false;
switch (Opcode) {
default: break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
28776 // isOperationLegal lies for integer ops on floating point types.
28777 CanFold = VT.isInteger();
break;
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
28782 // isOperationLegal lies for floating point ops on integer types.
28783 CanFold = VT.isFloatingPoint();
break;
}
28787 unsigned SVTNumElts = SVT.getVectorNumElements();
28788 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28789 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
28790 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
28791 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
28792 CanFold = SVOp->getMaskElt(i) < 0;
if (CanFold) {
28795 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
28796 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
28797 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
28798 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
28803 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
28804 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
28805 // consecutive, non-overlapping, and in the right order.
28806 SmallVector<SDValue, 16> Elts;
28807 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
28808 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
28809 Elts.push_back(Elt);
28816 if (Elts.size() == VT.getVectorNumElements())
28817 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
return LD;
28820 // For AVX2, we sometimes want to combine
28821 // (vector_shuffle <mask> (concat_vectors t1, undef)
28822 // (concat_vectors t2, undef))
28824 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
28825 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
28826 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
return ShufConcat;
28829 if (isTargetShuffle(N->getOpcode())) {
SDValue Op(N, 0);
28831 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
return Shuffle;
28834 // Try recursively combining arbitrary sequences of x86 shuffle
28835 // instructions into higher-order shuffles. We do this after combining
28836 // specific PSHUF instruction sequences into their minimal form so that we
28837 // can evaluate how many specialized shuffle instructions are involved in
28838 // a particular chain.
28839 SmallVector<int, 1> NonceMask; // Just a placeholder.
28840 NonceMask.push_back(0);
28841 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
28842 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
28844 return SDValue(); // This routine will use CombineTo to replace N.
28850 /// Check if a vector extract from a target-specific shuffle of a load can be
28851 /// folded into a single element load.
28852 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
28853 /// shuffles have been custom lowered so we need to handle those here.
28854 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
28855 TargetLowering::DAGCombinerInfo &DCI) {
28856 if (DCI.isBeforeLegalizeOps())
return SDValue();
28859 SDValue InVec = N->getOperand(0);
28860 SDValue EltNo = N->getOperand(1);
28861 EVT EltVT = N->getValueType(0);
28864 if (!isa<ConstantSDNode>(EltNo))
return SDValue();
28866 EVT OriginalVT = InVec.getValueType();
28868 // Peek through bitcasts, don't duplicate a load with other uses.
28869 InVec = peekThroughOneUseBitcasts(InVec);
28871 EVT CurrentVT = InVec.getValueType();
28872 if (!CurrentVT.isVector() ||
28873 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
return SDValue();
28876 if (!isTargetShuffle(InVec.getOpcode()))
return SDValue();
28879 // Don't duplicate a load with other uses.
28880 if (!InVec.hasOneUse())
return SDValue();
28883 SmallVector<int, 16> ShuffleMask;
28884 SmallVector<SDValue, 2> ShuffleOps;
bool UnaryShuffle;
28886 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
28887 ShuffleOps, ShuffleMask, UnaryShuffle))
return SDValue();
28890 // Select the input vector, guarding against an out-of-range extract index.
28891 unsigned NumElems = CurrentVT.getVectorNumElements();
28892 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28893 int Idx = (Elt >= (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28895 if (Idx == SM_SentinelZero)
28896 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28897 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28898 if (Idx == SM_SentinelUndef)
28899 return DAG.getUNDEF(EltVT);
28901 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28902 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
: ShuffleOps[1];
28905 // If inputs to shuffle are the same for both ops, then allow 2 uses
28906 unsigned AllowedUses =
28907 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28909 if (LdNode.getOpcode() == ISD::BITCAST) {
28910 // Don't duplicate a load with other uses.
28911 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
return SDValue();
28914 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28915 LdNode = LdNode.getOperand(0);
28918 if (!ISD::isNormalLoad(LdNode.getNode()))
return SDValue();
28921 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28923 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
return SDValue();
28926 // If there's a bitcast before the shuffle, check if the load type and
28927 // alignment are valid.
28928 unsigned Align = LN0->getAlignment();
28929 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28930 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28931 EltVT.getTypeForEVT(*DAG.getContext()));
28933 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
return SDValue();
28936 // All checks match so transform back to vector_shuffle so that DAG combiner
28937 // can finish the job.
SDLoc dl(N);

28940 // Create shuffle node taking into account the case that it's a unary shuffle
28941 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28942 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
ShuffleMask);
28944 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
28949 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
28950 const X86Subtarget &Subtarget) {
28951 SDValue N0 = N->getOperand(0);
28952 EVT VT = N->getValueType(0);
28953 EVT SrcVT = N0.getValueType();
28955 // Since MMX types are special and don't usually play with other vector types,
28956 // it's better to handle them early to be sure we emit efficient code by
28957 // avoiding store-load conversions.
28959 // Detect bitcasts from i32 to the x86mmx low word.
28960 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
28961 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
28962 SDValue N00 = N0->getOperand(0);
28963 if (N00.getValueType() == MVT::i32)
28964 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
28967 // Detect bitcasts from element or subvector extraction to x86mmx.
28968 if (VT == MVT::x86mmx &&
28969 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
28970 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
28971 isNullConstant(N0.getOperand(1))) {
28972 SDValue N00 = N0->getOperand(0);
28973 if (N00.getValueType().is128BitVector())
28974 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
28975 DAG.getBitcast(MVT::v2i64, N00));
28978 // Detect bitcasts from FP_TO_SINT to x86mmx.
28979 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
28980 N0.getOpcode() == ISD::FP_TO_SINT) {
28982 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
28983 DAG.getUNDEF(MVT::v2i32));
28984 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
28985 DAG.getBitcast(MVT::v2i64, Res));
28988 // Convert a bitcasted integer logic operation that has one bitcasted
28989 // floating-point operand into a floating-point logic operation. This may
28990 // create a load of a constant, but that is cheaper than materializing the
28991 // constant in an integer register and transferring it to an SSE register or
28992 // transferring the SSE operand to integer register and back.
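// Illustrative example: bitcasting (and (bitcast f32 X to i32), C) back to
// f32 becomes FAND(X, bitcast C), keeping the value in an SSE register.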
unsigned FPOpcode;
28994 switch (N0.getOpcode()) {
28995 case ISD::AND: FPOpcode = X86ISD::FAND; break;
28996 case ISD::OR: FPOpcode = X86ISD::FOR; break;
28997 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
28998 default: return SDValue();
29001 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
29002 (Subtarget.hasSSE2() && VT == MVT::f64)))
return SDValue();
29005 SDValue LogicOp0 = N0.getOperand(0);
29006 SDValue LogicOp1 = N0.getOperand(1);
SDLoc DL0(N);
29009 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
29010 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
29011 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
29012 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
29013 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
29014 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
29016 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
29017 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
29018 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
29019 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
29020 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
29021 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
29027 // Match a binop + shuffle pyramid that represents a horizontal reduction over
29028 // the elements of a vector.
29029 // Returns the vector that is being reduced on, or SDValue() if a reduction
29030 // was not matched.
29031 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
29032 // The pattern must end in an extract from index 0.
29033 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
29034 !isNullConstant(Extract->getOperand(1)))
return SDValue();

unsigned Stages =
29038 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
29040 SDValue Op = Extract->getOperand(0);
29041 // At each stage, we're looking for something that looks like:
29042 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
29043 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
29044 // i32 undef, i32 undef, i32 undef, i32 undef>
29045 // %a = binop <8 x i32> %op, %s
29046 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
29047 // we expect something like:
29048 // <4,5,6,7,u,u,u,u>
29049 // <2,3,u,u,u,u,u,u>
29050 // <1,u,u,u,u,u,u,u>
29051 for (unsigned i = 0; i < Stages; ++i) {
29052 if (Op.getOpcode() != BinOp)
return SDValue();
29055 ShuffleVectorSDNode *Shuffle =
29056 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
if (Shuffle) {
29058 Op = Op.getOperand(1);
} else {
29060 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
29061 Op = Op.getOperand(0);
}
29064 // The first operand of the shuffle should be the same as the other operand
// of the binop.
29066 if (!Shuffle || (Shuffle->getOperand(0) != Op))
return SDValue();
29069 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
29070 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
29071 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
return SDValue();
}

return Op;
}
29078 // Given a select, detect the following pattern:
29079 // 1: %2 = zext <N x i8> %0 to <N x i32>
29080 // 2: %3 = zext <N x i8> %1 to <N x i32>
29081 // 3: %4 = sub nsw <N x i32> %2, %3
29082 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29083 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
29084 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29085 // This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
                              SDValue &Op1) {
  // Check the condition of the select instruction is greater-than.
  SDValue SetCC = Select->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC)
    return false;
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  if (CC != ISD::SETGT && CC != ISD::SETLT)
    return false;

  SDValue SelectOp1 = Select->getOperand(1);
  SDValue SelectOp2 = Select->getOperand(2);

  // The following instructions assume SelectOp1 is the subtraction operand
  // and SelectOp2 is the negation operand.
  // In the case of SETLT this is the other way around.
  if (CC == ISD::SETLT)
    std::swap(SelectOp1, SelectOp2);

  // The second operand of the select should be the negation of the first
  // operand, which is implemented as 0 - SelectOp1.
  if (!(SelectOp2.getOpcode() == ISD::SUB &&
        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
        SelectOp2.getOperand(1) == SelectOp1))
    return false;

  // The first operand of SetCC is the first operand of the select, which is
  // the difference between the two input vectors.
  if (SetCC.getOperand(0) != SelectOp1)
    return false;

  // In the SETLT case, the second operand of the comparison can be either 1
  // or 0.
  APInt SplatVal;
  if ((CC == ISD::SETLT) &&
      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
         SplatVal == 1) ||
        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
    return false;

  // In the SETGT case, the second operand of the comparison can be either -1
  // or 0.
  if ((CC == ISD::SETGT) &&
      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
    return false;

  // The first operand of the select is the difference between the two input
  // vectors.
  if (SelectOp1.getOpcode() != ISD::SUB)
    return false;

  Op0 = SelectOp1.getOperand(0);
  Op1 = SelectOp1.getOperand(1);

  // Check if the operands of the sub are zero-extended from vectors of i8.
  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
      Op1.getOpcode() != ISD::ZERO_EXTEND ||
      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
    return false;

  return true;
}
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                            const SDValue &Zext1, const SDLoc &DL) {
  // Find the appropriate width for the PSADBW.
  EVT InVT = Zext0.getOperand(0).getValueType();
  unsigned RegSize = std::max(128u, InVT.getSizeInBits());

  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
  // fill in the missing vector elements with 0.
  unsigned NumConcat = RegSize / InVT.getSizeInBits();
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
  Ops[0] = Zext0.getOperand(0);
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
  Ops[0] = Zext1.getOperand(0);
  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

  // Actually build the SAD.
  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
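// Worked example (added commentary): for <4 x i8> inputs, InVT is 32 bits
// wide, so RegSize = max(128, 32) = 128 and NumConcat = 4. Each input becomes
//   (v16i8 (concat_vectors x, zero, zero, zero))
// and the PSADBW result type is v2i64 (128 / 64 lanes).
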
// Attempt to replace an all_of/any_of style horizontal reduction with a
// MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
                                                SelectionDAG &DAG,
                                                const X86Subtarget &Subtarget) {
  // Bail without SSE2 or with AVX512VL (which uses predicate registers).
  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  unsigned BitWidth = ExtractVT.getSizeInBits();
  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
      ExtractVT != MVT::i8)
    return SDValue();

  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
    SDValue Match = matchBinOpReduction(Extract, Op);
    if (!Match)
      continue;

    // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
    // which we can't support here for now.
    if (Match.getScalarValueSizeInBits() != BitWidth)
      continue;

    // We require AVX2 for PMOVMSKB for v16i16/v32i8.
    unsigned MatchSizeInBits = Match.getValueSizeInBits();
    if (!(MatchSizeInBits == 128 ||
          (MatchSizeInBits == 256 &&
           ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
      return SDValue();

    // Don't bother performing this for 2-element vectors.
    if (Match.getValueType().getVectorNumElements() <= 2)
      return SDValue();

    // Check that we are extracting a reduction of all sign bits.
    if (DAG.ComputeNumSignBits(Match) != BitWidth)
      return SDValue();

    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
    MVT MaskVT;
    if (64 == BitWidth || 32 == BitWidth)
      MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
                                MatchSizeInBits / BitWidth);
    else
      MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

    APInt CompareBits;
    ISD::CondCode CondCode;
    if (Op == ISD::OR) {
      // any_of -> MOVMSK != 0
      CompareBits = APInt::getNullValue(32);
      CondCode = ISD::CondCode::SETNE;
    } else {
      // all_of -> MOVMSK == ((1 << NumElts) - 1)
      CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
      CondCode = ISD::CondCode::SETEQ;
    }

    // Perform the select as i32/i64 and then truncate to avoid partial
    // register stalls.
    unsigned ResWidth = std::max(BitWidth, 32u);
    EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
    SDLoc DL(Extract);
    SDValue Zero = DAG.getConstant(0, DL, ResVT);
    SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
    SDValue Res = DAG.getBitcast(MaskVT, Match);
    Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
    Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
                          Ones, Zero, CondCode);
    return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
  }

  return SDValue();
}
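// Net effect (illustrative sketch): an all_of reduction over the sign bits of
// a v4i32/v4f32 comparison result becomes roughly
//   movmskps %xmm0, %eax
//   cmpl     $15, %eax      ; all four mask bits set?
// followed by materializing all-ones/zero, instead of a shuffle+and pyramid.
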
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // PSADBW is only supported on SSE2 and up.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Verify the type we're extracting from is any integer type above i16.
  EVT VT = Extract->getOperand(0).getValueType();
  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
    return SDValue();

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;

  // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (RegSize / VT.getVectorNumElements() < 8)
    return SDValue();

  // Match shuffle + add pyramid.
  SDValue Root = matchBinOpReduction(Extract, ISD::ADD);

  // The operand is expected to be zero extended from i8
  // (verified in detectZextAbsDiff).
  // In order to convert to i64 and above, additional any/zero/sign
  // extend is expected.
  // The zero extend from 32 bit has no mathematical effect on the result.
  // Also the sign extend is basically zero extend
  // (extends the sign bit which is zero).
  // So it is correct to skip the sign/zero extend instruction.
  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
               Root.getOpcode() == ISD::ZERO_EXTEND ||
               Root.getOpcode() == ISD::ANY_EXTEND))
    Root = Root.getOperand(0);

  // If there was a match, we want Root to be a select that is the root of an
  // abs-diff pattern.
  if (!Root || (Root.getOpcode() != ISD::VSELECT))
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  SDValue Zext0, Zext1;
  if (!detectZextAbsDiff(Root, Zext0, Zext1))
    return SDValue();

  // Create the SAD instruction.
  SDLoc DL(Extract);
  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

  // If the original vector was wider than 8 elements, sum over the results
  // in the SAD vector.
  unsigned Stages = Log2_32(VT.getVectorNumElements());
  MVT SadVT = SAD.getSimpleValueType();
  if (Stages > 3) {
    unsigned SadElems = SadVT.getVectorNumElements();

    for (unsigned i = Stages - 3; i > 0; --i) {
      SmallVector<int, 16> Mask(SadElems, -1);
      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
        Mask[j] = MaskEnd + j;

      SDValue Shuffle =
          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
    }
  }

  MVT Type = Extract->getSimpleValueType(0);
  unsigned TypeSizeInBits = Type.getSizeInBits();
  // Return the lowest TypeSizeInBits bits.
  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
  SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
                     Extract->getOperand(1));
}
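// End-to-end sketch (added commentary): a 16-element i8 abs-diff reduction
// ideally lowers to something like
//   psadbw %xmm1, %xmm0        ; two partial sums, one per i64 lane
//   pshufd $0xee, %xmm0, %xmm1 ; move the upper sum down
//   paddd  %xmm0, %xmm1
//   movd   %xmm1, %eax
// rather than widening every element to i32 and reducing with shuffles.
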
// Attempt to peek through a target shuffle and extract the scalar from the
// source vector instead.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue Src = N->getOperand(0);
  SDValue Idx = N->getOperand(1);

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  EVT SrcSVT = SrcVT.getVectorElementType();
  unsigned NumSrcElts = SrcVT.getVectorNumElements();

  // Don't attempt this for boolean mask vectors or unknown extraction indices.
  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
    return SDValue();

  // Resolve the target shuffle inputs and mask.
  SmallVector<int, 16> Mask;
  SmallVector<SDValue, 2> Ops;
  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
    return SDValue();

  // Attempt to narrow/widen the shuffle mask to the correct size.
  if (Mask.size() != NumSrcElts) {
    if ((NumSrcElts % Mask.size()) == 0) {
      SmallVector<int, 16> ScaledMask;
      int Scale = NumSrcElts / Mask.size();
      scaleShuffleMask(Scale, Mask, ScaledMask);
      Mask = std::move(ScaledMask);
    } else if ((Mask.size() % NumSrcElts) == 0) {
      SmallVector<int, 16> WidenedMask;
      while (Mask.size() > NumSrcElts &&
             canWidenShuffleElements(Mask, WidenedMask))
        Mask = std::move(WidenedMask);
      // TODO - investigate support for wider shuffle masks with known upper
      // undef/zero elements for implicit zero-extension.
    }
  }

  // Check if narrowing/widening failed.
  if (Mask.size() != NumSrcElts)
    return SDValue();

  int SrcIdx = Mask[N->getConstantOperandVal(1)];
  SDLoc dl(N);

  // If the shuffle source element is undef/zero then we can just accept it.
  if (SrcIdx == SM_SentinelUndef)
    return DAG.getUNDEF(VT);

  if (SrcIdx == SM_SentinelZero)
    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
                                : DAG.getConstant(0, dl, VT);

  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
  SrcOp = DAG.getBitcast(SrcVT, SrcOp);
  SrcIdx = SrcIdx % Mask.size();

  // We can only extract other elements from 128-bit vectors and in certain
  // circumstances, depending on SSE-level.
  // TODO: Investigate using extract_subvector for larger vectors.
  // TODO: Investigate float/double extraction if it will be just stored.
  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
    assert(SrcSVT == VT && "Unexpected extraction type");
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
                       DAG.getIntPtrConstant(SrcIdx, dl));
  }

  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
           "Unexpected extraction type");
    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
                                DAG.getIntPtrConstant(SrcIdx, dl));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
                                 DAG.getValueType(SrcSVT));
    return DAG.getZExtOrTrunc(Assert, dl, VT);
  }

  return SDValue();
}
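// Example (added commentary): extracting a lane of a shuffled v8i16 source
// collapses to a single
//   pextrw $5, %xmm0, %eax
// on the shuffle's input (assuming the shuffle maps the requested lane to
// source lane 5), with the AssertZext recording that the upper 16 bits of the
// i32 result are known zero.
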
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
    return NewOp;

  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
    return NewOp;

  SDValue InputVector = N->getOperand(0);
  SDValue EltIdx = N->getOperand(1);

  EVT SrcVT = InputVector.getValueType();
  EVT VT = N->getValueType(0);
  SDLoc dl(InputVector);

  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
    SDValue MMXSrc = InputVector.getOperand(0);

    // The bitcast source is a direct mmx result.
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getBitcast(VT, InputVector);
  }

  // Detect mmx to i32 conversion through a v2i32 elt extract.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
    SDValue MMXSrc = InputVector.getOperand(0);

    // The bitcast source is a direct mmx result.
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
  }

  if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
      isa<ConstantSDNode>(EltIdx) &&
      isa<ConstantSDNode>(InputVector.getOperand(0))) {
    uint64_t ExtractedElt = N->getConstantOperandVal(1);
    uint64_t InputValue = InputVector.getConstantOperandVal(0);
    uint64_t Res = (InputValue >> ExtractedElt) & 1;
    return DAG.getConstant(Res, dl, MVT::i1);
  }

  // Check whether this extract is the root of a sum of absolute differences
  // pattern. This has to be done here because we really want it to happen
  // pre-legalization.
  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
    return SAD;

  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
    return Cmp;

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (SrcVT != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  // If 64-bit shifts are legal, use the extract-shift sequence,
  // otherwise bounce the vector off the cache.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Vals[4];

  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
    auto &DL = DAG.getDataLayout();
    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
                                     DAG.getConstant(0, dl, VecIdxTy));
    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
                                  DAG.getConstant(1, dl, VecIdxTy));

    SDValue ShAmt = DAG.getConstant(
        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                          DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                          DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
  } else {
    // Store the value to a temporary stack slot.
    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
                              MachinePointerInfo());

    EVT ElementType = SrcVT.getVectorElementType();
    unsigned EltSize = ElementType.getSizeInBits() / 8;

    // Replace each use (extract) with a load of the appropriate element.
    for (unsigned i = 0; i < 4; ++i) {
      uint64_t Offset = EltSize * i;
      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);

      SDValue ScalarAddr =
          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);

      // Load the scalar.
      Vals[i] =
          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
    }
  }

  // Replace the extracts.
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    uint64_t IdxVal = Extract->getConstantOperandVal(1);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}
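// Sketch of the two strategies above (added commentary): when 64-bit SRA is
// legal, the four i32 lanes are recovered as
//   trunc(lo64), trunc(lo64 >> 32), trunc(hi64), trunc(hi64 >> 32)
// otherwise the vector is spilled to a stack slot and each lane is re-loaded
// as a scalar.
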
// TODO - merge with combineExtractVectorElt once it can handle the implicit
// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  EVT CondVT = Cond.getValueType();
  SDLoc DL(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (N->getOpcode() != ISD::VSELECT)
    return SDValue();

  assert(CondVT.isVector() && "Vector select expects a vector selector!");

  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
  // Check if the first operand is all zeros and Cond type is vXi1.
  // This situation only applies to AVX512.
  if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
      CondVT.getVectorElementType() == MVT::i1) {
    // Invert the cond to not(cond) : xor(op,allones)=not(op)
    SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
                                  DAG.getAllOnesConstant(DL, CondVT));
    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
  }

  // To use the condition operand as a bitwise mask, it must have elements that
  // are the same size as the select elements. I.e., the condition operand must
  // have already been promoted from the IR select condition type <N x i1>.
  // Don't check if the types themselves are equal because that excludes
  // vector floating-point selects.
  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
  FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

  // Try to invert the condition if true value is not all 1s and false value is
  // not all 0s.
  if (!TValIsAllOnes && !FValIsAllZeros &&
      // Check if the selector will be produced by CMPP*/PCMP*.
      Cond.getOpcode() == ISD::SETCC &&
      // Check if SETCC has already been promoted.
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
          CondVT) {
    bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

    if (TValIsAllZeros || FValIsAllOnes) {
      SDValue CC = Cond.getOperand(2);
      ISD::CondCode NewCC =
          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                               Cond.getOperand(0).getValueType().isInteger());
      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
                          NewCC);
      std::swap(LHS, RHS);
      TValIsAllOnes = FValIsAllOnes;
      FValIsAllZeros = TValIsAllZeros;
    }
  }

  // vselect Cond, 111..., 000... -> Cond
  if (TValIsAllOnes && FValIsAllZeros)
    return DAG.getBitcast(VT, Cond);

  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
    return SDValue();

  // vselect Cond, 111..., X -> or Cond, X
  if (TValIsAllOnes) {
    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
    return DAG.getBitcast(VT, Or);
  }

  // vselect Cond, X, 000... -> and Cond, X
  if (FValIsAllZeros) {
    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
    return DAG.getBitcast(VT, And);
  }

  return SDValue();
}
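// E.g. (added sketch): with a lane-sized mask M, (vselect M, X, 0) becomes
// (bitcast (and M, (bitcast X))), i.e. a single PAND/ANDPS, and
// (vselect M, -1, X) likewise maps to a single OR.
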
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  SDLoc DL(N);

  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
  if (!TrueC || !FalseC)
    return SDValue();

  // Don't do this for crazy integer types.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
    return SDValue();

  // If this is efficiently invertible, canonicalize the LHSC/RHSC values
  // so that TrueC (the true value) is larger than FalseC.
  bool NeedsCondInvert = false;
  if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
      // Efficiently invertible.
      (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
       (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
        isa<ConstantSDNode>(Cond.getOperand(1))))) {
    NeedsCondInvert = true;
    std::swap(TrueC, FalseC);
  }

  // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
  if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
    if (NeedsCondInvert) // Invert the condition if needed.
      Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                         DAG.getConstant(1, DL, Cond.getValueType()));

    // Zero extend the condition if needed.
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

    unsigned ShAmt = TrueC->getAPIntValue().logBase2();
    return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                       DAG.getConstant(ShAmt, DL, MVT::i8));
  }
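  // Worked example (added commentary): C ? 8 : 0 with an i32 result becomes
  //   %z = zext i1 %C to i32
  //   %r = shl i32 %z, 3
  // i.e. one setcc/zext plus a shift, with no branch or cmov.
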
  // Optimize cases that will turn into an LEA instruction. This requires
  // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
  if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
    uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
    if (N->getValueType(0) == MVT::i32)
      Diff = (unsigned)Diff;

    bool isFastMultiplier = false;
    if (Diff < 10) {
      switch ((unsigned char)Diff) {
      default:
        break;
      case 1: // result = add base, cond
      case 2: // result = lea base(    , cond*2)
      case 3: // result = lea base(cond, cond*2)
      case 4: // result = lea base(    , cond*4)
      case 5: // result = lea base(cond, cond*4)
      case 8: // result = lea base(    , cond*8)
      case 9: // result = lea base(cond, cond*8)
        isFastMultiplier = true;
        break;
      }
    }

    if (isFastMultiplier) {
      APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
      if (NeedsCondInvert) // Invert the condition if needed.
        Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(1, DL, Cond.getValueType()));

      // Zero extend the condition if needed.
      Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
      // Scale the condition by the difference.
      if (Diff != 1)
        Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(Diff, DL, Cond.getValueType()));

      // Add the base if non-zero.
      if (FalseC->getAPIntValue() != 0)
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));
      return Cond;
    }
  }

  return SDValue();
}
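// Worked example (added commentary): C ? 5 : 2 has Diff = 3, so with the
// zero-extended condition in %eax this becomes a single
//   leal 2(%rax,%rax,2), %eax    ; cond*3 + 2
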
// If this is a bitcasted op that can be represented as another type, push the
// bitcast to the inputs. This allows more opportunities for pattern
// matching masked instructions. This is called when we know that the operation
// is used as one of the inputs of a vselect.
static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  // Make sure we have a bitcast.
  if (OrigOp.getOpcode() != ISD::BITCAST)
    return false;

  SDValue Op = OrigOp.getOperand(0);

  // If the operation is used by anything other than the bitcast, we shouldn't
  // do this combine as that would replicate the operation.
  if (!Op.hasOneUse())
    return false;

  MVT VT = OrigOp.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  SDLoc DL(Op.getNode());

  auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
                                      SDValue Op2) {
    Op0 = DAG.getBitcast(VT, Op0);
    DCI.AddToWorklist(Op0.getNode());
    Op1 = DAG.getBitcast(VT, Op1);
    DCI.AddToWorklist(Op1.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
    return true;
  };

  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::PALIGNR:
    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
    if (!VT.is128BitVector())
      return false;
    Opcode = X86ISD::VALIGN;
    LLVM_FALLTHROUGH;
  case X86ISD::VALIGN: {
    if (EltVT != MVT::i32 && EltVT != MVT::i64)
      return false;
    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
    unsigned EltSize = EltVT.getSizeInBits();
    // Make sure we can represent the same shift with the new VT.
    if ((ShiftAmt % EltSize) != 0)
      return false;
    Imm = ShiftAmt / EltSize;
    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
                                    DAG.getConstant(Imm, DL, MVT::i8));
  }
  case X86ISD::SHUF128: {
    if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
      return false;
    // Only change element size, not type.
    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
      return false;
    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
                                    Op.getOperand(2));
  }
  case ISD::INSERT_SUBVECTOR: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    // Only change element size, not type.
    if (EltVT.isInteger() != OpEltVT.isInteger())
      return false;
    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
    SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
    DCI.AddToWorklist(Op0.getNode());
    // Op1 needs to be bitcasted to a smaller vector with the same element
    // type.
    SDValue Op1 = Op.getOperand(1);
    MVT Op1VT = MVT::getVectorVT(EltVT,
                                 Op1.getSimpleValueType().getSizeInBits() / EltSize);
    Op1 = DAG.getBitcast(Op1VT, Op1);
    DCI.AddToWorklist(Op1.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0, Op1,
                              DAG.getIntPtrConstant(Imm, DL)));
    return true;
  }
  case ISD::EXTRACT_SUBVECTOR: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    // Only change element size, not type.
    if (EltVT.isInteger() != OpEltVT.isInteger())
      return false;
    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
    // Op0 needs to be bitcasted to a larger vector with the same element
    // type.
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = MVT::getVectorVT(EltVT,
                                 Op0.getSimpleValueType().getSizeInBits() / EltSize);
    Op0 = DAG.getBitcast(Op0VT, Op0);
    DCI.AddToWorklist(Op0.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0,
                              DAG.getIntPtrConstant(Imm, DL)));
    return true;
  }
  case X86ISD::SUBV_BROADCAST: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    // Only change element size, not type.
    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
      return false;
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = MVT::getVectorVT(EltVT,
                                 Op0.getSimpleValueType().getSizeInBits() / EltSize);
    Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
    DCI.AddToWorklist(Op0.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0));
    return true;
  }
  }

  return false;
}
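// Worked example (added commentary): a v16i8 PALIGNR that rotates by 8 bytes,
// when its bitcast result is used as a v4i32 masked operand, becomes a
// VALIGND with Imm = (8 * 8) / 32 = 2.
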
/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  EVT CondVT = Cond.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
      VT != MVT::f80 && VT != MVT::f128 &&
      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
      (Subtarget.hasSSE2() ||
       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }
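  // Note (added commentary): e.g. (select (setolt x, y), x, y) on f32 maps
  // directly to MINSS. The zero/NaN guards above exist because MINSS/MAXSS
  // are not symmetric: for -0.0 vs +0.0 and for NaN inputs they return the
  // second operand, so blindly swapping operands can change the result.
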
  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
  // lowering on KNL. In this case we convert it to
  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
  // The same situation holds for all 128 and 256-bit vectors of i8 and i16.
  // Since SKX these selects have a proper lowering.
  if (Subtarget.hasAVX512() && CondVT.isVector() &&
      CondVT.getVectorElementType() == MVT::i1 &&
      (VT.is128BitVector() || VT.is256BitVector()) &&
      (VT.getVectorElementType() == MVT::i8 ||
       VT.getVectorElementType() == MVT::i16) &&
      !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
    DCI.AddToWorklist(Cond.getNode());
    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
  }

  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
    return V;

  // Canonicalize max and min:
  // (x > y) ? x : y -> (x >= y) ? x : y
  // (x < y) ? x : y -> (x <= y) ? x : y
  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
  // the need for an extra compare against zero. e.g.
  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
  // subl    %esi, %edi
  // testl   %edi, %edi
  // movl    $0, %eax
  // cmovgl  %edi, %eax
  // =>
  // xorl    %eax, %eax
  // subl    %esi, %edi
  // cmovsl  %eax, %edi
  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    if (CC == ISD::SETLT || CC == ISD::SETGT) {
      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
    }
  }
  // Early exit check.
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // Match VSELECTs into subs with unsigned saturation.
  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
       (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
    // left side invert the predicate to simplify logic below.
    SDValue Other;
    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
      Other = RHS;
      CC = ISD::getSetCCInverse(CC, true);
    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
      Other = LHS;
    }

    if (Other.getNode() && Other->getNumOperands() == 2 &&
        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
      SDValue CondRHS = Cond->getOperand(1);

      // Look for a general sub with unsigned saturation first.
      // x >= y ? x-y : 0 --> subus x, y
      // x >  y ? x-y : 0 --> subus x, y
      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
              // If the RHS is a constant we have to reverse the const
              // canonicalization.
              // x > C-1 ? x+-C : 0 --> subus x, C
              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
                  CondRHSConst->getAPIntValue() ==
                      (-OpRHSConst->getAPIntValue() - 1))
                return DAG.getNode(
                    X86ISD::SUBUS, DL, VT, OpLHS,
                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));

          // Another special case: If C was a sign bit, the sub has been
          // canonicalized into a xor.
          // FIXME: Would it be better to use computeKnownBits to determine
          //        whether it's safe to decanonicalize the xor?
          // x s< 0 ? x^C : 0 --> subus x, C
          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
              OpRHSConst->getAPIntValue().isSignMask())
            // Note that we have to rebuild the RHS constant here to ensure we
            // don't rely on particular values of undef lanes.
            return DAG.getNode(
                X86ISD::SUBUS, DL, VT, OpLHS,
                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
        }
    }
  }
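  // E.g. (added sketch): (vselect (setugt x, y), (sub x, y), 0) on v8i16
  // becomes a single
  //   psubusw %xmm1, %xmm0
  // which computes the saturating difference max(x - y, 0) per lane.
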
  if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
    return V;

  // If this is a *dynamic* select (non-constant condition) and we can match
  // this node with one of the variable blend instructions, restructure the
  // condition so that blends can use the high (sign) bit of each element and
  // use SimplifyDemandedBits to simplify the condition operand.
  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
      !DCI.isBeforeLegalize() &&
      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
    unsigned BitWidth = Cond.getScalarValueSizeInBits();

    // Don't optimize vector selects that map to mask-registers.
    if (BitWidth == 1)
      return SDValue();

    // We can only handle the cases where VSELECT is directly legal on the
    // subtarget. We custom lower VSELECT nodes with constant conditions and
    // this makes it hard to see whether a dynamic VSELECT will correctly
    // lower, so we both check the operation's status and explicitly handle the
    // cases where a *dynamic* blend will fail even though a constant-condition
    // blend could be custom lowered.
    // FIXME: We should find a better way to handle this class of problems.
    // Potentially, we should combine constant-condition vselect nodes
    // pre-legalization into shuffles and not mark as many types as custom
    // lowered.
    if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
      return SDValue();
    // FIXME: We don't support i16-element blends currently. We could and
    // should support them by making *all* the bits in the condition be set
    // rather than just the high bit and using an i8-element blend.
    if (VT.getVectorElementType() == MVT::i16)
      return SDValue();
    // Dynamic blending was only available from SSE4.1 onward.
    if (VT.is128BitVector() && !Subtarget.hasSSE41())
      return SDValue();
    // Byte blends are only available in AVX2.
    if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
      return SDValue();

    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
    APInt DemandedMask(APInt::getSignMask(BitWidth));
    KnownBits Known;
    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                          DCI.isBeforeLegalizeOps());
    if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
      // If we changed the computation somewhere in the DAG, this change will
      // affect all users of Cond. Make sure it is fine and update all the nodes
      // so that we do not use the generic VSELECT anymore. Otherwise, we may
      // perform wrong optimizations as we messed with the actual expectation
      // for the vector boolean values.
      if (Cond != TLO.Old) {
        // Check all uses of the condition operand to check whether it will be
        // consumed by non-BLEND instructions. Those may require that all bits
        // are set properly.
        for (SDNode *U : Cond->uses()) {
          // TODO: Add other opcodes eventually lowered into BLEND.
          if (U->getOpcode() != ISD::VSELECT)
            return SDValue();
        }

        // Update all users of the condition before committing the change, so
        // that the VSELECT optimizations that expect the correct vector boolean
        // value will not be triggered.
        for (SDNode *U : Cond->uses()) {
          SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
                                   U->getValueType(0), Cond, U->getOperand(1),
                                   U->getOperand(2));
          DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
        }
        DCI.CommitTargetLoweringOpt(TLO);
        return SDValue();
      }
      // Only Cond (rather than other nodes in the computation chain) was
      // changed. Change the condition just for N to keep the opportunity to
      // optimize all other users their own way.
      SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
      return SDValue();
    }
  }

  // Look for vselects with LHS/RHS being bitcasted from an operation that
  // can be executed on another type. Push the bitcast to the inputs of
  // the operation. This exposes opportunities for using masking instructions.
  if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
      CondVT.getVectorElementType() == MVT::i1) {
    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
      return SDValue(N, 0);
    if (combineBitcastForMaskedOp(RHS, DAG, DCI))
      return SDValue(N, 0);
  }

  return SDValue();
}
/// Combine:
///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
                                       SelectionDAG &DAG) {
  // This combine only operates on CMP-like nodes.
  if (!(Cmp.getOpcode() == X86ISD::CMP ||
        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
    return SDValue();

  // Can't replace the cmp if it has more uses than the one we're looking at.
  // FIXME: We would like to be able to handle this, but would need to make sure
  // all uses were updated.
  if (!Cmp.hasOneUse())
    return SDValue();

  // This only applies to variations of the common case:
  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
  // Using the proper condcodes (see below), overflow is checked for.
  //
  // FIXME: We can generalize both constraints:
  // - XOR/OR/AND (if they were made to survive AtomicExpand)
  // - LHS != 1
  // if the result is compared.

  SDValue CmpLHS = Cmp.getOperand(0);
  SDValue CmpRHS = Cmp.getOperand(1);

  if (!CmpLHS.hasOneUse())
    return SDValue();

  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
  if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
    return SDValue();

  const unsigned Opc = CmpLHS.getOpcode();

  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
    return SDValue();

  SDValue OpRHS = CmpLHS.getOperand(2);
  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
  if (!OpRHSC)
    return SDValue();

  APInt Addend = OpRHSC->getAPIntValue();
  if (Opc == ISD::ATOMIC_LOAD_SUB)
    Addend = -Addend;

  if (CC == X86::COND_S && Addend == 1)
    CC = X86::COND_LE;
  else if (CC == X86::COND_NS && Addend == 1)
    CC = X86::COND_GT;
  else if (CC == X86::COND_G && Addend == -1)
    CC = X86::COND_GE;
  else if (CC == X86::COND_LE && Addend == -1)
    CC = X86::COND_L;
  else
    return SDValue();

  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
                                DAG.getUNDEF(CmpLHS.getValueType()));
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
  return LockOp;
}
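// E.g. (added sketch): for `if (atomic_fetch_add(&x, 1) < 0)` this lets us
// branch on the flags of the locked RMW itself:
//   lock incl (%rdi)
//   jle       ...        ; COND_S on the old value -> COND_LE on the result
// instead of re-reading the value and issuing a separate cmp/test.
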
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
  // This combine only operates on CMP-like nodes.
  if (!(Cmp.getOpcode() == X86ISD::CMP ||
        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
    return SDValue();

  // Quit if not used as a boolean value.
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  // Check CMP operands. One of them should be 0 or 1 and the other should be
  // an SetCC or extended from it.
  SDValue Op1 = Cmp.getOperand(0);
  SDValue Op2 = Cmp.getOperand(1);

  SDValue SetCC;
  const ConstantSDNode* C = nullptr;
  bool needOppositeCond = (CC == X86::COND_E);
  bool checkAgainstTrue = false; // Is it a comparison against 1?

  if ((C = dyn_cast<ConstantSDNode>(Op1)))
    SetCC = Op2;
  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
    SetCC = Op1;
  else // Quit if all operands are not constants.
    return SDValue();

  if (C->getZExtValue() == 1) {
    needOppositeCond = !needOppositeCond;
    checkAgainstTrue = true;
  } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 nor 1.
    return SDValue();

  bool truncatedToBoolWithAnd = false;
  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
         SetCC.getOpcode() == ISD::TRUNCATE ||
         SetCC.getOpcode() == ISD::AND) {
    if (SetCC.getOpcode() == ISD::AND) {
      int OpIdx = -1;
      if (isOneConstant(SetCC.getOperand(0)))
        OpIdx = 1;
      if (isOneConstant(SetCC.getOperand(1)))
        OpIdx = 0;
      if (OpIdx < 0)
        break;
      SetCC = SetCC.getOperand(OpIdx);
      truncatedToBoolWithAnd = true;
    } else
      SetCC = SetCC.getOperand(0);
  }

  switch (SetCC.getOpcode()) {
  case X86ISD::SETCC_CARRY:
    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
    // truncated to i1 using 'and'.
    if (checkAgainstTrue && !truncatedToBoolWithAnd)
      break;
    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
           "Invalid use of SETCC_CARRY!");
    LLVM_FALLTHROUGH;
  case X86ISD::SETCC:
    // Set the condition code or opposite one if necessary.
    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(1);
  case X86ISD::CMOV: {
    // Check whether false/true value has canonical one, i.e. 0 or 1.
    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
    // Quit if true value is not a constant.
    if (!TVal)
      return SDValue();
    // Quit if false value is not a constant.
    if (!FVal) {
      SDValue Op = SetCC.getOperand(0);
      // Skip 'zext' or 'trunc' node.
      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
          Op.getOpcode() == ISD::TRUNCATE)
        Op = Op.getOperand(0);
      // A special case for rdrand/rdseed, where 0 is set if false cond is
      // found.
      if ((Op.getOpcode() != X86ISD::RDRAND &&
           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
        return SDValue();
    }
    // Quit if false value is not the constant 0 or 1.
    bool FValIsFalse = true;
    if (FVal && FVal->getZExtValue() != 0) {
      if (FVal->getZExtValue() != 1)
        return SDValue();
      // If FVal is 1, opposite cond is needed.
      needOppositeCond = !needOppositeCond;
      FValIsFalse = false;
    }
    // Quit if TVal is not the constant opposite of FVal.
    if (FValIsFalse && TVal->getZExtValue() != 1)
      return SDValue();
    if (!FValIsFalse && TVal->getZExtValue() != 0)
      return SDValue();
    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(3);
  }
  }

  return SDValue();
}
/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
///   (X86or (X86setcc) (X86setcc))
///   (X86cmp (and (X86setcc) (X86setcc)), 0)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                           X86::CondCode &CC1, SDValue &Flags,
                                           bool &isAnd) {
  if (Cond->getOpcode() == X86ISD::CMP) {
    if (!isNullConstant(Cond->getOperand(1)))
      return false;

    Cond = Cond->getOperand(0);
  }

  isAnd = false;

  SDValue SetCC0, SetCC1;
  switch (Cond->getOpcode()) {
  default: return false;
  case ISD::AND:
  case X86ISD::AND:
    isAnd = true;
    LLVM_FALLTHROUGH;
  case ISD::OR:
  case X86ISD::OR:
    SetCC0 = Cond->getOperand(0);
    SetCC1 = Cond->getOperand(1);
    break;
  }

  // Make sure we have SETCC nodes, using the same flags value.
  if (SetCC0.getOpcode() != X86ISD::SETCC ||
      SetCC1.getOpcode() != X86ISD::SETCC ||
      SetCC0->getOperand(1) != SetCC1->getOperand(1))
    return false;

  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
  Flags = SetCC0->getOperand(1);
  return true;
}
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
                                  SelectionDAG &DAG) {
  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
    return R;
  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
}
30516 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30517 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30518 TargetLowering::DAGCombinerInfo &DCI,
30519 const X86Subtarget &Subtarget) {
30522 // If the flag operand isn't dead, don't touch this CMOV.
30523 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
30526 SDValue FalseOp = N->getOperand(0);
30527 SDValue TrueOp = N->getOperand(1);
30528 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
30529 SDValue Cond = N->getOperand(3);
30531 if (CC == X86::COND_E || CC == X86::COND_NE) {
30532 switch (Cond.getOpcode()) {
30536 // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
30537 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
30538 return (CC == X86::COND_E) ? FalseOp : TrueOp;
30542 // Try to simplify the EFLAGS and condition code operands.
30543 // We can't always do this as FCMOV only supports a subset of X86 cond.
30544 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
30545 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
30546 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
30548 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30552 // If this is a select between two integer constants, try to do some
30553 // optimizations. Note that the operands are ordered the opposite of SELECT
30555 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
30556 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
30557 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
30558 // larger than FalseC (the false value).
30559 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
30560 CC = X86::GetOppositeBranchCondition(CC);
30561 std::swap(TrueC, FalseC);
30562 std::swap(TrueOp, FalseOp);
30565 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
30566 // This is efficient for any integer data type (including i8/i16) and
30568 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30569 Cond = getSETCC(CC, Cond, DL, DAG);
30571 // Zero extend the condition if needed.
30572 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
30574 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30575 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
30576 DAG.getConstant(ShAmt, DL, MVT::i8));
30577 if (N->getNumValues() == 2) // Dead flag value?
30578 return DCI.CombineTo(N, Cond, SDValue());
30582 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
30583 // for any integer data type, including i8/i16.
30584 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
30585 Cond = getSETCC(CC, Cond, DL, DAG);
30587 // Zero extend the condition if needed.
30588 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
30589 FalseC->getValueType(0), Cond);
30590 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30591 SDValue(FalseC, 0));
30593 if (N->getNumValues() == 2) // Dead flag value?
30594 return DCI.CombineTo(N, Cond, SDValue());
30598 // Optimize cases that will turn into an LEA instruction. This requires
30599 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30600 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
30601 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
30602 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
30604 bool isFastMultiplier = false;
30606 switch ((unsigned char)Diff) {
30608 case 1: // result = add base, cond
30609 case 2: // result = lea base( , cond*2)
30610 case 3: // result = lea base(cond, cond*2)
30611 case 4: // result = lea base( , cond*4)
30612 case 5: // result = lea base(cond, cond*4)
30613 case 8: // result = lea base( , cond*8)
30614 case 9: // result = lea base(cond, cond*8)
30615 isFastMultiplier = true;
30620 if (isFastMultiplier) {
30621 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
30622 Cond = getSETCC(CC, Cond, DL ,DAG);
30623 // Zero extend the condition if needed.
30624 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
30626 // Scale the condition by the difference.
30628 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30629 DAG.getConstant(Diff, DL, Cond.getValueType()));
30631 // Add the base if non-zero.
30632 if (FalseC->getAPIntValue() != 0)
30633 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30634 SDValue(FalseC, 0));
30635 if (N->getNumValues() == 2) // Dead flag value?
30636 return DCI.CombineTo(N, Cond, SDValue());
30643 // Handle these cases:
30644 // (select (x != c), e, c) -> select (x != c), e, x),
30645 // (select (x == c), c, e) -> select (x == c), x, e)
30646 // where the c is an integer constant, and the "select" is the combination
30647 // of CMOV and CMP.
30649 // The rationale for this change is that the conditional-move from a constant
30650 // needs two instructions, however, conditional-move from a register needs
30651 // only one instruction.
30653 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
30654 // some instruction-combining opportunities. This opt needs to be
30655 // postponed as late as possible.
30657 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
30658 // the DCI.xxxx conditions are provided to postpone the optimization as
30659 // late as possible.
30661 ConstantSDNode *CmpAgainst = nullptr;
30662 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
30663 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
30664 !isa<ConstantSDNode>(Cond.getOperand(0))) {
30666 if (CC == X86::COND_NE &&
30667 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
30668 CC = X86::GetOppositeBranchCondition(CC);
30669 std::swap(TrueOp, FalseOp);
}

if (CC == X86::COND_E &&
30673 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
30674 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
30675 DAG.getConstant(CC, DL, MVT::i8), Cond };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
}
}
30681 // Fold and/or of setcc's to double CMOV:
30682 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
30683 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
// This combine lets us generate:
// cmovcc1 (jcc1 if we don't have CMOV)
// cmovcc2 (same)
// instead of:
// setcc1
// setcc2
// and/or
// cmovne (jne if we don't have CMOV)
// When we can't use the CMOV instruction, it might increase branch
// mispredicts.
// When we can use CMOV, or when there is no mispredict, this improves
// throughput and reduces register pressure.
30698 if (CC == X86::COND_NE) {
SDValue Flags;
X86::CondCode CC0, CC1;
bool isAndSetCC;
if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
if (isAndSetCC) {
std::swap(FalseOp, TrueOp);
CC0 = X86::GetOppositeBranchCondition(CC0);
CC1 = X86::GetOppositeBranchCondition(CC1);
}

SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
Flags};
SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
30712 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
30713 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
return CMOV;
}
}

return SDValue();
}
30722 /// Different mul shrinking modes.
30723 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
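// MULS8/MULU8: both operands fit in i8 after truncation, so only the low
// halves (one pmullw) are needed. MULS16/MULU16: operands fit in i16, so the
// full 32-bit product is rebuilt from pmullw plus pmulhw/pmulhuw.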
30725 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
30726 EVT VT = N->getOperand(0).getValueType();
if (VT.getScalarSizeInBits() != 32)
return false;
30730 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
30731 unsigned SignBits[2] = {1, 1};
30732 bool IsPositive[2] = {false, false};
30733 for (unsigned i = 0; i < 2; i++) {
30734 SDValue Opd = N->getOperand(i);
// DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
30737 // compute signbits for it separately.
30738 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
// For anyextend, it is safe to assume an appropriate number of leading
// sign/zero bits.
if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
SignBits[i] = 25;
else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
MVT::i16)
SignBits[i] = 17;
else
return false;
IsPositive[i] = true;
30749 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
30750 // All the operands of BUILD_VECTOR need to be int constant.
30751 // Find the smallest value range which all the operands belong to.
SignBits[i] = 32;
IsPositive[i] = true;
for (const SDValue &SubOp : Opd.getNode()->op_values()) {
if (SubOp.isUndef())
continue;
auto *CN = dyn_cast<ConstantSDNode>(SubOp);
if (!CN)
return false;
APInt IntVal = CN->getAPIntValue();
if (IntVal.isNegative())
IsPositive[i] = false;
SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
}
} else {
SignBits[i] = DAG.ComputeNumSignBits(Opd);
if (Opd.getOpcode() == ISD::ZERO_EXTEND)
IsPositive[i] = true;
}
}
30772 bool AllPositive = IsPositive[0] && IsPositive[1];
30773 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
30774 // When ranges are from -128 ~ 127, use MULS8 mode.
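// (A 32-bit value with at least 25 sign bits has at most 8 significant bits,
// i.e. it round-trips through i8; 17 sign bits likewise round-trips through
// i16. The unsigned cases need one extra sign bit to guarantee the truncated
// value has a clear top bit.)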
if (MinSignBits >= 25)
Mode = MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
Mode = MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
Mode = MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
Mode = MULU16;
else
return false;
return true;
}
30791 /// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
30793 /// efficient code. Two typical patterns are handled:
30795 /// %2 = sext/zext <N x i8> %1 to <N x i32>
30796 /// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or    %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// and:
30801 /// %2 = zext/sext <N x i16> %1 to <N x i32>
30802 /// %4 = zext/sext <N x i16> %3 to <N x i32>
30803 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30804 /// %5 = mul <N x i32> %2, %4
30806 /// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
30809 /// generate pmullw+sext32 for it (MULS8 mode).
30810 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
30811 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
30812 /// generate pmullw+zext32 for it (MULU8 mode).
30813 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
30814 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
30815 /// generate pmullw+pmulhw for it (MULS16 mode).
30816 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
30817 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
30818 /// generate pmullw+pmulhuw for it (MULU16 mode).
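///
/// For example, in MULU8 mode the operands are truncated to <N x i16>, one
/// pmullw produces the full product (a u8*u8 product fits in 16 bits), and
/// the result is zero-extended back to <N x i32>.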
30819 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
30820 const X86Subtarget &Subtarget) {
// Check for legality
// pmullw/pmulhw are not supported before SSE2.
if (!Subtarget.hasSSE2())
return SDValue();

// Check for profitability
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// pmullw+pmulhw.
bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();

ShrinkMode Mode;
if (!canReduceVMulWidth(N, DAG, Mode))
return SDValue();

SDLoc DL(N);
30839 SDValue N0 = N->getOperand(0);
30840 SDValue N1 = N->getOperand(1);
30841 EVT VT = N->getOperand(0).getValueType();
30842 unsigned RegSize = 128;
30843 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT =
EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
30846 // Shrink the operands of mul.
30847 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
30848 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
30850 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
30851 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
30852 // lower part is needed.
30853 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
30854 if (Mode == MULU8 || Mode == MULS8) {
return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
DL, VT, MulLo);
} else {
30858 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30859 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
30860 // the higher part is also needed.
30861 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30862 ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
30866 // Generate shuffle functioning as punpcklwd.
30867 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
30868 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30869 ShuffleMask[2 * i] = i;
30870 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
}
SDValue ResLo =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30874 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
30875 // Generate shuffle functioning as punpckhwd.
30876 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30877 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
30878 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
}
SDValue ResHi =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30882 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
}
}
30886 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
30887 // to legalize the mul explicitly because implicit legalization for type
30888 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
30889 // instructions which will not exist when we explicitly legalize it by
30890 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
30891 // <4 x i16> undef).
30893 // Legalize the operands of mul.
30894 // FIXME: We may be able to handle non-concatenated vectors by insertion.
30895 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
if ((RegSize % ReducedSizeInBits) != 0)
return SDValue();

SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
DAG.getUNDEF(ReducedVT));
Ops[0] = NewN0;
NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
Ops[0] = NewN1;
NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30906 if (Mode == MULU8 || Mode == MULS8) {
// Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
// part is needed.
SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);

// Convert the type of mul result to VT.
MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
: ISD::SIGN_EXTEND_VECTOR_INREG,
DL, ResVT, Mul);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
30919 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
30920 // MULU16/MULS16, both parts are needed.
30921 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30922 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30923 OpsVT, NewN0, NewN1);
30925 // Repack the lower part and higher part result of mul into a wider
30926 // result. Make sure the type of mul result is VT.
30927 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30928 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
30929 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
30930 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
30936 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
30937 EVT VT, SDLoc DL) {
30939 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
30940 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
30941 DAG.getConstant(Mult, DL, VT));
30942 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
30943 DAG.getConstant(Shift, DL, MVT::i8));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0),
Result);
return Result;
};
30949 auto combineMulMulAddOrSub = [&](bool isAdd) {
30950 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
30951 DAG.getConstant(9, DL, VT));
30952 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0),
Result);
return Result;
};
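// Each expansion below uses at most two multiply-by-3/5/9 style operations
// (selectable as LEA) plus a shift/add/sub, which is generally cheaper than
// the imul it replaces on targets with fast LEA.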
switch (MulAmt) {
default:
break;
case 11:
// mul x, 11 => add ((shl (mul x, 5), 1), x)
30963 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
case 21:
// mul x, 21 => add ((shl (mul x, 5), 2), x)
30966 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
case 22:
// mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
30969 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
30970 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
case 19:
// mul x, 19 => sub ((shl (mul x, 5), 2), x)
30973 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
case 13:
// mul x, 13 => add ((shl (mul x, 3), 2), x)
30976 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
case 23:
// mul x, 23 => sub ((shl (mul x, 3), 3), x)
30979 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
case 14:
// mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
30982 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
30983 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
case 26:
// mul x, 26 => sub ((mul (mul x, 9), 3), x)
30986 return combineMulMulAddOrSub(/*isAdd*/ false);
case 28:
// mul x, 28 => add ((mul (mul x, 9), 3), x)
30989 return combineMulMulAddOrSub(/*isAdd*/ true);
case 29:
// mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
30992 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
30993 combineMulMulAddOrSub(/*isAdd*/ true));
case 30:
// mul x, 30 => sub (sub ((shl x, 5), x), x)
30996 return DAG.getNode(
30997 ISD::SUB, DL, VT, N->getOperand(0),
30998 DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0),
30999 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(5, DL, MVT::i8))));
}
return SDValue();
}
31005 /// Optimize a single multiply with constant into two operations in order to
31006 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
31007 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
31008 TargetLowering::DAGCombinerInfo &DCI,
31009 const X86Subtarget &Subtarget) {
31010 EVT VT = N->getValueType(0);
31011 if (DCI.isBeforeLegalize() && VT.isVector())
31012 return reduceVMULWidth(N, DAG, Subtarget);
if (!MulConstantOptimization)
return SDValue();

// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();

if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();

if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();

ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
uint64_t MulAmt = C->getZExtValue();
if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
return SDValue();

SDLoc DL(N);
31033 uint64_t MulAmt1 = 0;
31034 uint64_t MulAmt2 = 0;
if ((MulAmt % 9) == 0) {
MulAmt1 = 9;
MulAmt2 = MulAmt / 9;
} else if ((MulAmt % 5) == 0) {
MulAmt1 = 5;
MulAmt2 = MulAmt / 5;
} else if ((MulAmt % 3) == 0) {
MulAmt1 = 3;
MulAmt2 = MulAmt / 3;
}

SDValue NewMul;
if (MulAmt2 &&
31049 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
31051 if (isPowerOf2_64(MulAmt2) &&
31052 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
// If the second multiplier is a power of 2, issue it first. We want the
// multiply by 3, 5, or 9 to be folded into the addressing mode unless the
// lone use is an add.
std::swap(MulAmt1, MulAmt2);
31058 if (isPowerOf2_64(MulAmt1))
31059 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31060 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31063 DAG.getConstant(MulAmt1, DL, VT));
31065 if (isPowerOf2_64(MulAmt2))
31066 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
31067 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31070 DAG.getConstant(MulAmt2, DL, VT));
31071 } else if (!Subtarget.slowLEA())
31072 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
if (!NewMul) {
assert(MulAmt != 0 &&
31076 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31077 "Both cases that could cause potential overflows should have "
31078 "already been handled.");
31079 int64_t SignMulAmt = C->getSExtValue();
31080 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31081 (SignMulAmt != -INT64_MAX)) {
31082 int NumSign = SignMulAmt > 0 ? 1 : -1;
31083 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31084 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31085 if (IsPowerOf2_64PlusOne) {
31086 // (mul x, 2^N + 1) => (add (shl x, N), x)
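// e.g. mul x, 17 => (add (shl x, 4), x); a negative amount such as -17
// matches here via NumSign and is negated after this if/else chain.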
31087 NewMul = DAG.getNode(
31088 ISD::ADD, DL, VT, N->getOperand(0),
31089 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
MVT::i8)));
31092 } else if (IsPowerOf2_64MinusOne) {
31093 // (mul x, 2^N - 1) => (sub (shl x, N), x)
NewMul = DAG.getNode(
ISD::SUB, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
MVT::i8)),
N->getOperand(0));
31101 // To negate, subtract the number from zero
31102 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
NewMul =
DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
}
}

if (NewMul)
31109 // Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, NewMul, false);

return SDValue();
}
31115 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31116 SDValue N0 = N->getOperand(0);
31117 SDValue N1 = N->getOperand(1);
31118 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31119 EVT VT = N0.getValueType();
31121 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zeros or all ones.
31123 if (VT.isInteger() && !VT.isVector() &&
31124 N1C && N0.getOpcode() == ISD::AND &&
31125 N0.getOperand(1).getOpcode() == ISD::Constant) {
31126 SDValue N00 = N0.getOperand(0);
31127 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31128 Mask <<= N1C->getAPIntValue();
31129 bool MaskOK = false;
31130 // We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure we are semantics
// preserving.
31133 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31134 // of the underlying setcc_c operation if the setcc_c was zero extended.
31135 // Consider the following example:
31136 // zext(setcc_c) -> i32 0x0000FFFF
31137 // c1 -> i32 0x0000FFFF
31138 // c2 -> i32 0x00000001
31139 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31140 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
31143 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
31146 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31147 N00.getOpcode() == ISD::ANY_EXTEND) &&
31148 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
}
if (MaskOK && Mask != 0) {
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
}
}
31157 // Hardware support for vector shifts is sparse which makes us scalarize the
// vector operations in many cases. Also, on Sandy Bridge ADD is faster than
// shl.
31160 // (shl V, 1) -> add V,V
31161 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31162 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31163 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31164 // We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
31167 if (N1SplatC->getAPIntValue() == 1)
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}

return SDValue();
}
31174 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31175 SDValue N0 = N->getOperand(0);
31176 SDValue N1 = N->getOperand(1);
31177 EVT VT = N0.getValueType();
31178 unsigned Size = VT.getSizeInBits();
31180 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31181 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31182 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31183 // depending on sign of (SarConst - [56,48,32,24,16])
31185 // sexts in X86 are MOVs. The MOVs have the same code size
31186 // as above SHIFTs (only SHIFT on 1 has lower code size).
31187 // However the MOVs have 2 advantages to a SHIFT:
31188 // 1. MOVs can write to a register that differs from source
31189 // 2. MOVs accept memory operands
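// For example, on i32:
// (sra (shl X, 24), 25) --> (sra (sext_inreg X, i8), 1)
// (sra (shl X, 24), 22) --> (shl (sext_inreg X, i8), 2)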
31191 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31192 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
N0.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
31196 SDValue N00 = N0.getOperand(0);
31197 SDValue N01 = N0.getOperand(1);
31198 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31199 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31200 EVT CVT = N1.getValueType();
if (SarConst.isNegative())
return SDValue();
31205 for (MVT SVT : MVT::integer_valuetypes()) {
31206 unsigned ShiftSize = SVT.getSizeInBits();
31207 // skipping types without corresponding sext/zext and
31208 // ShlConst that is not one of [56,48,32,24,16]
if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
continue;
SDLoc DL(N);
SDValue NN =
DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
SarConst = SarConst - (Size - ShiftSize);
if (SarConst == 0)
return NN;
31217 else if (SarConst.isNegative())
31218 return DAG.getNode(ISD::SHL, DL, VT, NN,
31219 DAG.getConstant(-SarConst, DL, CVT));
31221 return DAG.getNode(ISD::SRA, DL, VT, NN,
DAG.getConstant(SarConst, DL, CVT));
}
return SDValue();
}
31227 /// \brief Returns a vector of 0s if the node in input is a vector logical
31228 /// shift by a constant amount which is known to be bigger than or equal
31229 /// to the vector element size in bits.
31230 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31231 const X86Subtarget &Subtarget) {
31232 EVT VT = N->getValueType(0);
31234 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31235 (!Subtarget.hasInt256() ||
(VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
return SDValue();

SDValue Amt = N->getOperand(1);
SDLoc DL(N);
31241 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31242 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31243 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31244 unsigned MaxAmount =
31245 VT.getSimpleVT().getScalarSizeInBits();
31247 // SSE2/AVX2 logical shifts always return a vector of 0s
31248 // if the shift amount is bigger than or equal to
31249 // the element size. The constant shift amount will be
// encoded as an 8-bit immediate.
31251 if (ShiftAmt.trunc(8).uge(MaxAmount))
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
}

return SDValue();
}
31258 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31259 TargetLowering::DAGCombinerInfo &DCI,
31260 const X86Subtarget &Subtarget) {
if (N->getOpcode() == ISD::SHL)
if (SDValue V = combineShiftLeft(N, DAG))
return V;

if (N->getOpcode() == ISD::SRA)
if (SDValue V = combineShiftRightAlgebraic(N, DAG))
return V;

// Try to fold this logical shift into a zero vector.
if (N->getOpcode() != ISD::SRA)
if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
return V;

return SDValue();
}
31277 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31278 TargetLowering::DAGCombinerInfo &DCI,
31279 const X86Subtarget &Subtarget) {
31280 unsigned Opcode = N->getOpcode();
31281 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31282 X86ISD::VSRLI == Opcode) &&
31283 "Unexpected shift opcode");
31284 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31285 EVT VT = N->getValueType(0);
31286 SDValue N0 = N->getOperand(0);
31287 SDValue N1 = N->getOperand(1);
31288 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31289 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31290 "Unexpected value type");
31292 // Out of range logical bit shifts are guaranteed to be zero.
31293 // Out of range arithmetic bit shifts splat the sign bit.
31294 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
if (LogicalShift)
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
else
ShiftVal = NumBitsPerElt - 1;
}

// Shift N0 by zero -> N0.
if (!ShiftVal)
return N0;
31306 // Shift zero -> zero.
31307 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31308 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31310 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31311 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31312 // TODO - support other sra opcodes as needed.
31313 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31314 N0.getOpcode() == X86ISD::VSRAI)
31315 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31317 // We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}

// Constant Folding.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
31331 if (N->isOnlyUserOf(N0.getNode()) &&
31332 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31333 assert(EltBits.size() == VT.getVectorNumElements() &&
31334 "Unexpected shift value type");
31335 unsigned ShiftImm = ShiftVal.getZExtValue();
31336 for (APInt &Elt : EltBits) {
if (X86ISD::VSHLI == Opcode)
Elt <<= ShiftImm;
else if (X86ISD::VSRAI == Opcode)
Elt.ashrInPlace(ShiftImm);
else
Elt.lshrInPlace(ShiftImm);
}
return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
}

return SDValue();
}
31350 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31351 TargetLowering::DAGCombinerInfo &DCI,
31352 const X86Subtarget &Subtarget) {
assert(((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31355 (N->getOpcode() == X86ISD::PINSRW &&
31356 N->getValueType(0) == MVT::v8i16)) &&
31357 "Unexpected vector insertion");
31359 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
31362 NonceMask.push_back(0);
31363 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget);
return SDValue(N, 0);
}
31369 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31370 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31371 /// OR -> CMPNEQSS.
31372 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31373 TargetLowering::DAGCombinerInfo &DCI,
31374 const X86Subtarget &Subtarget) {
unsigned opcode;

// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31378 // we're requiring SSE2 for both.
31379 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31380 SDValue N0 = N->getOperand(0);
31381 SDValue N1 = N->getOperand(1);
31382 SDValue CMP0 = N0->getOperand(1);
SDValue CMP1 = N1->getOperand(1);
SDLoc DL(N);
31386 // The SETCCs should both refer to the same CMP.
if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
return SDValue();
31390 SDValue CMP00 = CMP0->getOperand(0);
31391 SDValue CMP01 = CMP0->getOperand(1);
31392 EVT VT = CMP00.getValueType();
31394 if (VT == MVT::f32 || VT == MVT::f64) {
31395 bool ExpectingFlags = false;
31396 // Check for any users that want flags:
31397 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31398 !ExpectingFlags && UI != UE; ++UI)
switch (UI->getOpcode()) {
default:
case ISD::BR_CC:
case ISD::BRCOND:
case ISD::SELECT:
ExpectingFlags = true;
break;
case ISD::CopyToReg:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
}
31413 if (!ExpectingFlags) {
31414 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31415 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31417 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
X86::CondCode tmp = cc0;
cc0 = cc1;
cc1 = tmp;
}
31423 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31424 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31425 // FIXME: need symbolic constants for these magic numbers.
31426 // See X86ATTInstPrinter.cpp:printSSECC().
31427 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31428 if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
31431 DAG.getConstant(x86cc, DL, MVT::i8));
31432 return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
FSetCC, DAG.getIntPtrConstant(0, DL));
}
31435 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31436 CMP00.getValueType(), CMP00, CMP01,
DAG.getConstant(x86cc, DL,
MVT::i8));
31440 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31441 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31443 if (is64BitFP && !Subtarget.is64Bit()) {
31444 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31445 // 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones or all zeroes, we don't need all the
31447 // bits, but can do this little dance to extract the lowest 32 bits
31448 // and work with those going forward.
SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
OnesOrZeroesF);
31451 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31452 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
Vector32, DAG.getIntPtrConstant(0, DL));
IntVT = MVT::i32;
}
31457 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31458 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31459 DAG.getConstant(1, DL, IntVT));
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
ANDed);
return OneBitOfTruth;
}
}
}
}
return SDValue();
}
31470 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31471 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31472 assert(N->getOpcode() == ISD::AND);
31474 EVT VT = N->getValueType(0);
31475 SDValue N0 = N->getOperand(0);
31476 SDValue N1 = N->getOperand(1);
SDLoc DL(N);

if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
31482 if (N0.getOpcode() == ISD::XOR &&
31483 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31484 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31486 if (N1.getOpcode() == ISD::XOR &&
31487 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

return SDValue();
}
31493 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31494 // register. In most cases we actually compare or select YMM-sized registers
31495 // and mixing the two types creates horrible code. This method optimizes
31496 // some of the transition sequences.
31497 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31498 TargetLowering::DAGCombinerInfo &DCI,
31499 const X86Subtarget &Subtarget) {
31500 EVT VT = N->getValueType(0);
if (!VT.is256BitVector())
return SDValue();
31504 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31505 N->getOpcode() == ISD::ZERO_EXTEND ||
31506 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31508 SDValue Narrow = N->getOperand(0);
31509 EVT NarrowVT = Narrow->getValueType(0);
if (!NarrowVT.is128BitVector())
return SDValue();
31513 if (Narrow->getOpcode() != ISD::XOR &&
31514 Narrow->getOpcode() != ISD::AND &&
Narrow->getOpcode() != ISD::OR)
return SDValue();
31518 SDValue N0 = Narrow->getOperand(0);
SDValue N1 = Narrow->getOperand(1);
SDLoc DL(N);
31522 // The Left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
return SDValue();
31526 // The type of the truncated inputs.
EVT WideVT = N0->getOperand(0)->getValueType(0);
if (WideVT != VT)
return SDValue();
31531 // The right side has to be a 'trunc' or a constant vector.
31532 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31533 ConstantSDNode *RHSConstSplat = nullptr;
31534 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31535 RHSConstSplat = RHSBV->getConstantSplatNode();
if (!RHSTrunc && !RHSConstSplat)
return SDValue();
31539 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
return SDValue();
31544 // Set N0 and N1 to hold the inputs to the new wide operation.
31545 N0 = N0->getOperand(0);
31546 if (RHSConstSplat) {
31547 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31548 SDValue(RHSConstSplat, 0));
31549 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31550 } else if (RHSTrunc) {
N1 = N1->getOperand(0);
}
31554 // Generate the wide operation.
31555 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
unsigned Opcode = N->getOpcode();
switch (Opcode) {
case ISD::ANY_EXTEND:
return Op;
31560 case ISD::ZERO_EXTEND: {
31561 unsigned InBits = NarrowVT.getScalarSizeInBits();
31562 APInt Mask = APInt::getAllOnesValue(InBits);
31563 Mask = Mask.zext(VT.getScalarSizeInBits());
31564 return DAG.getNode(ISD::AND, DL, VT,
Op, DAG.getConstant(Mask, DL, VT));
}
31567 case ISD::SIGN_EXTEND:
31568 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
31569 Op, DAG.getValueType(NarrowVT));
default:
llvm_unreachable("Unexpected opcode");
}
}
31575 /// If both input operands of a logic op are being cast from floating point
31576 /// types, try to convert this into a floating point logic node to avoid
31577 /// unnecessary moves from SSE to integer registers.
31578 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
31579 const X86Subtarget &Subtarget) {
31580 unsigned FPOpcode = ISD::DELETED_NODE;
31581 if (N->getOpcode() == ISD::AND)
31582 FPOpcode = X86ISD::FAND;
31583 else if (N->getOpcode() == ISD::OR)
31584 FPOpcode = X86ISD::FOR;
31585 else if (N->getOpcode() == ISD::XOR)
31586 FPOpcode = X86ISD::FXOR;
31588 assert(FPOpcode != ISD::DELETED_NODE &&
31589 "Unexpected input node for FP logic conversion");
31591 EVT VT = N->getValueType(0);
31592 SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
31595 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
31596 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
31597 (Subtarget.hasSSE2() && VT == MVT::i64))) {
31598 SDValue N00 = N0.getOperand(0);
31599 SDValue N10 = N1.getOperand(0);
31600 EVT N00Type = N00.getValueType();
31601 EVT N10Type = N10.getValueType();
31602 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
31603 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
return DAG.getBitcast(VT, FPLogic);
}
}
return SDValue();
}
31610 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
31611 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
31612 /// with a shift-right to eliminate loading the vector constant mask value.
31613 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
31614 const X86Subtarget &Subtarget) {
31615 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
31616 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
31617 EVT VT0 = Op0.getValueType();
31618 EVT VT1 = Op1.getValueType();
if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();

APInt SplatVal;
if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
!SplatVal.isMask())
return SDValue();
if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
return SDValue();
31631 unsigned EltBitWidth = VT0.getScalarSizeInBits();
if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();

SDLoc DL(N);
31636 unsigned ShiftVal = SplatVal.countTrailingOnes();
31637 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
31638 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
31642 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
31643 TargetLowering::DAGCombinerInfo &DCI,
31644 const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();

if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;

if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;

if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;

if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
31660 EVT VT = N->getValueType(0);
31661 SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
31665 // Attempt to recursively combine a bitmask AND with shuffles.
31666 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
31669 NonceMask.push_back(0);
31670 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31671 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
return SDValue(); // This routine will use CombineTo to replace N.
}
31676 // Create BEXTR instructions
31677 // BEXTR is ((X >> imm) & (2**size-1))
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();

if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
return SDValue();
if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
return SDValue();
31686 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
31687 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
31688 if (MaskNode && ShiftNode) {
31689 uint64_t Mask = MaskNode->getZExtValue();
31690 uint64_t Shift = ShiftNode->getZExtValue();
31691 if (isMask_64(Mask)) {
31692 uint64_t MaskSize = countPopulation(Mask);
31693 if (Shift + MaskSize <= VT.getSizeInBits())
31694 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
DAG.getConstant(Shift | (MaskSize << 8), DL,
VT));
}
}
return SDValue();
}

// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
31705 // (vselect m, x, y)
31706 // As a special case, try to fold:
// (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
31709 // (sub (xor X, M), M)
31710 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
31711 const X86Subtarget &Subtarget) {
31712 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
31714 SDValue N0 = N->getOperand(0);
31715 SDValue N1 = N->getOperand(1);
31716 EVT VT = N->getValueType(0);
31718 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
31722 // Canonicalize AND to LHS.
if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
31726 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
31727 // ANDNP combine allows other combines to happen that prevent matching.
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return SDValue();
31731 SDValue Mask = N1.getOperand(0);
SDValue X = N1.getOperand(1);
SDValue Y;
31734 if (N0.getOperand(0) == Mask)
31735 Y = N0.getOperand(1);
31736 if (N0.getOperand(1) == Mask)
31737 Y = N0.getOperand(0);
// Check to see if the mask appeared in both the AND and ANDNP.
if (!Y.getNode())
return SDValue();
31743 // Validate that X, Y, and Mask are bitcasts, and see through them.
31744 Mask = peekThroughBitcasts(Mask);
31745 X = peekThroughBitcasts(X);
31746 Y = peekThroughBitcasts(Y);
31748 EVT MaskVT = Mask.getValueType();
31749 unsigned EltBits = MaskVT.getScalarSizeInBits();
31751 // TODO: Attempt to handle floating point cases as well?
if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();

SDLoc DL(N);

// Try to match:
31758 // (or (and (M, (sub 0, X)), (pandn M, X)))
31759 // which is a special case of vselect:
31760 // (vselect M, (sub 0, X), X)
31762 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
31763 // We know that, if fNegate is 0 or 1:
31764 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
31766 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
31767 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
31768 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
31769 // This lets us transform our vselect to:
31770 // (add (xor X, M), (and M, 1))
31772 // (sub (xor X, M), M)
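// (When M is all ones, (and M, 1) == 1 == -M; when M is zero, both sides
// are 0, so adding (and M, 1) is the same as subtracting M.)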
31773 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
31774 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
31775 auto IsNegV = [](SDNode *N, SDValue V) {
31776 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
31777 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
SDValue V;
if (IsNegV(Y.getNode(), X))
V = X;
else if (IsNegV(X.getNode(), Y))
V = Y;

if (V) {
31786 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
31787 SDValue SubOp2 = Mask;
31789 // If the negate was on the false side of the select, then
31790 // the operands of the SUB need to be swapped. PR 27251.
31791 // This is because the pattern being matched above is
31792 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
31793 // but if the pattern matched was
31794 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
31795 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
31796 // pattern also needs to be a negation of the replacement pattern above.
31797 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
31798 // sub accomplishes the negation of the replacement pattern.
if (V == Y)
std::swap(SubOp1, SubOp2);
31802 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
return DAG.getBitcast(VT, Res);
}
}
31807 // PBLENDVB is only available on SSE 4.1.
if (!Subtarget.hasSSE41())
return SDValue();
31811 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
31813 X = DAG.getBitcast(BlendVT, X);
31814 Y = DAG.getBitcast(BlendVT, Y);
31815 Mask = DAG.getBitcast(BlendVT, Mask);
31816 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
31820 // Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
// seteq(cmp x, 0)
// into:
// srl(ctlz x), log2(bitsize(x))
31825 // Input pattern is checked by caller.
31826 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
31827 SelectionDAG &DAG) {
31828 SDValue Cmp = Op.getOperand(1);
31829 EVT VT = Cmp.getOperand(0).getValueType();
31830 unsigned Log2b = Log2_32(VT.getSizeInBits());
SDLoc dl(Op);
SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
31833 // The result of the shift is true or false, and on X86, the 32-bit
31834 // encoding of shr and lzcnt is more desirable.
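// For an i32 input x, ctlz(x) == 32 iff x == 0, and 32 is the only possible
// ctlz result with bit 5 set, so (srl (ctlz x), 5) computes (x == 0).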
31835 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
31836 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
31837 DAG.getConstant(Log2b, dl, VT));
return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
31841 // Try to transform:
31842 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
31845 // Will also attempt to match more generic cases, eg:
31846 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
31847 // Only applies if the target supports the FastLZCNT feature.
31848 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
31849 TargetLowering::DAGCombinerInfo &DCI,
31850 const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
return SDValue();
31854 auto isORCandidate = [](SDValue N) {
return (N->getOpcode() == ISD::OR && N->hasOneUse());
};
31858 // Check the zero extend is extending to 32-bit or more. The code generated by
31859 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
31860 // instructions to clear the upper bits.
31861 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
!isORCandidate(N->getOperand(0)))
return SDValue();
31865 // Check the node matches: setcc(eq, cmp 0)
31866 auto isSetCCCandidate = [](SDValue N) {
31867 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
31868 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
31869 N->getOperand(1).getOpcode() == X86ISD::CMP &&
31870 isNullConstant(N->getOperand(1).getOperand(1)) &&
N->getOperand(1).getValueType().bitsGE(MVT::i32);
};
31874 SDNode *OR = N->getOperand(0).getNode();
31875 SDValue LHS = OR->getOperand(0);
31876 SDValue RHS = OR->getOperand(1);
31878 // Save nodes matching or(or, setcc(eq, cmp 0)).
31879 SmallVector<SDNode *, 2> ORNodes;
31880 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
31881 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
31882 ORNodes.push_back(OR);
31883 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
31884 LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
}
31888 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
31889 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
!isORCandidate(SDValue(OR, 0)))
return SDValue();
// We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
// into:
// or(srl(ctlz),srl(ctlz)).
31896 // The dag combiner can then fold it into:
31897 // srl(or(ctlz, ctlz)).
31898 EVT VT = OR->getValueType(0);
31899 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
31900 SDValue Ret, NewRHS;
31901 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

if (!Ret)
return SDValue();
31907 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
31908 while (ORNodes.size() > 0) {
31909 OR = ORNodes.pop_back_val();
31910 LHS = OR->getOperand(0);
31911 RHS = OR->getOperand(1);
31912 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
31913 if (RHS->getOpcode() == ISD::OR)
31914 std::swap(LHS, RHS);
31915 EVT VT = OR->getValueType(0);
SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
return SDValue();
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
}

if (Ret)
Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

return Ret;
}
31928 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
31929 TargetLowering::DAGCombinerInfo &DCI,
31930 const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();

if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;

if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;

if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
31943 SDValue N0 = N->getOperand(0);
31944 SDValue N1 = N->getOperand(1);
31945 EVT VT = N->getValueType(0);
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
31950 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
31951 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
31953 // SHLD/SHRD instructions have lower register pressure, but on some
31954 // platforms they have higher latency than the equivalent
31955 // series of shifts/or that would otherwise be generated.
31956 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
31957 // have higher latencies and we are not optimizing for size.
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();

if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
return SDValue();
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
31968 SDValue ShAmt0 = N0.getOperand(1);
if (ShAmt0.getValueType() != MVT::i8)
return SDValue();
SDValue ShAmt1 = N1.getOperand(1);
if (ShAmt1.getValueType() != MVT::i8)
return SDValue();
31974 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
31975 ShAmt0 = ShAmt0.getOperand(0);
31976 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
31977 ShAmt1 = ShAmt1.getOperand(0);
SDLoc DL(N);
unsigned Opc = X86ISD::SHLD;
31981 SDValue Op0 = N0.getOperand(0);
31982 SDValue Op1 = N1.getOperand(0);
31983 if (ShAmt0.getOpcode() == ISD::SUB ||
31984 ShAmt0.getOpcode() == ISD::XOR) {
31985 Opc = X86ISD::SHRD;
31986 std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
}
31990 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
31991 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
31992 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
31993 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
31994 unsigned Bits = VT.getSizeInBits();
31995 if (ShAmt1.getOpcode() == ISD::SUB) {
31996 SDValue Sum = ShAmt1.getOperand(0);
31997 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
31998 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
31999 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
32000 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
32001 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
return DAG.getNode(Opc, DL, VT,
Op0, Op1,
DAG.getNode(ISD::TRUNCATE, DL,
MVT::i8, ShAmt0));
}
32007 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
32008 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
32009 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
32010 return DAG.getNode(Opc, DL, VT,
32011 N0.getOperand(0), N1.getOperand(0),
DAG.getNode(ISD::TRUNCATE, DL,
MVT::i8, ShAmt0));
32014 } else if (ShAmt1.getOpcode() == ISD::XOR) {
32015 SDValue Mask = ShAmt1.getOperand(1);
32016 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
32017 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
32018 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
32019 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
32020 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
32021 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
32022 if (Op1.getOpcode() == InnerShift &&
32023 isa<ConstantSDNode>(Op1.getOperand(1)) &&
32024 Op1.getConstantOperandVal(1) == 1) {
32025 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32026 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32028 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
32029 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
32030 Op1.getOperand(0) == Op1.getOperand(1)) {
32031 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
}
}
}
}

return SDValue();
}
32041 /// Generate NEG and CMOV for integer abs.
32042 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
32043 EVT VT = N->getValueType(0);
32045 // Since X86 does not have CMOV for 8-bit integer, we don't convert
32046 // 8-bit integer abs to NEG and CMOV.
if (VT.isInteger() && VT.getSizeInBits() == 8)
return SDValue();
32050 SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
32054 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
32055 // and change it to SUB and CMOV.
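// (This matches the canonical abs expansion: with Y = X >>s (size-1),
// (X + Y) ^ Y == |X|.)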
32056 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32057 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32058 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32059 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32060 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32061 // Generate SUB & CMOV.
32062 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32063 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32064 SDValue Ops[] = {N0.getOperand(0), Neg,
32065 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32066 SDValue(Neg.getNode(), 1)};
return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
}
}
return SDValue();
}
32073 /// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
/// SETGT(X, -1)
32077 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32078 // This is only worth doing if the output type is i8 or i1.
32079 EVT ResultType = N->getValueType(0);
32080 if (ResultType != MVT::i8 && ResultType != MVT::i1)
32083 SDValue N0 = N->getOperand(0);
32084 SDValue N1 = N->getOperand(1);
32086 // We should be performing an xor against a truncated shift.
if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
return SDValue();
32090 // Make sure we are performing an xor against one.
if (!isOneConstant(N1))
return SDValue();
32094 // SetCC on x86 zero extends so only act on this if it's a logical shift.
32095 SDValue Shift = N0.getOperand(0);
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
return SDValue();
32099 // Make sure we are truncating from one of i16, i32 or i64.
32100 EVT ShiftTy = Shift.getValueType();
if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
return SDValue();
32104 // Make sure the shift amount extracts the sign bit.
32105 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
return SDValue();
32109 // Create a greater-than comparison against -1.
32110 // N.B. Using SETGE against 0 works but we want a canonical looking
// comparison, using SETGT matches up with what TranslateX86CC does.
SDLoc DL(N);
SDValue ShiftOp = Shift.getOperand(0);
32114 EVT ShiftOpTy = ShiftOp.getValueType();
32115 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32116 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32117 *DAG.getContext(), ResultType);
32118 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32119 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32120 if (SetCCResultType != ResultType)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
return Cond;
}
32125 /// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// pcmpgt X, -1
///
32130 /// This should be called before type legalization because the pattern may not
32131 /// persist after that.
32132 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32133 const X86Subtarget &Subtarget) {
32134 EVT VT = N->getValueType(0);
if (!VT.isSimple())
return SDValue();
32138 switch (VT.getSimpleVT().SimpleTy) {
32139 default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32143 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
}
32150 // There must be a shift right algebraic before the xor, and the xor must be a
32151 // 'not' operation.
32152 SDValue Shift = N->getOperand(0);
32153 SDValue Ones = N->getOperand(1);
32154 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
!ISD::isBuildVectorAllOnes(Ones.getNode()))
return SDValue();
32158 // The shift should be smearing the sign bit across each vector element.
auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
if (!ShiftBV)
return SDValue();
32163 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32164 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
return SDValue();
32168 // Create a greater-than comparison against -1. We don't use the more obvious
32169 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
32173 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
32174 /// is valid for the given \p Subtarget.
32175 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32176 const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX512())
return false;
32180 // FIXME: Scalar type may be supported if we move it to vector register.
if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
return false;
32184 EVT SrcElVT = SrcVT.getScalarType();
32185 EVT DstElVT = DstVT.getScalarType();
if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
return false;
if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
return false;
32190 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
return false;
}
32195 /// Detect a pattern of truncation with saturation:
32196 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
32199 static SDValue detectUSatPattern(SDValue In, EVT VT) {
if (In.getOpcode() != ISD::UMIN)
return SDValue();

// Saturation with truncation. We truncate from InVT to VT.
32204 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32205 "Unexpected types for truncate operation");
APInt C;
if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
// C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
// the element size of the destination type.
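// e.g. a splat of 255 before a truncate to <16 x i8> makes the umin an
// unsigned-saturating truncate of its other operand.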
return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
SDValue();
}
return SDValue();
}
32217 /// Detect a pattern of truncation with saturation:
32218 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32219 /// The types should allow to use VPMOVUS* instruction on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
32222 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32223 const X86Subtarget &Subtarget) {
if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return SDValue();
return detectUSatPattern(In, VT);
}

static SDValue
32230 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32231 const X86Subtarget &Subtarget) {
32232 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
return SDValue();
32235 if (auto USatVal = detectUSatPattern(In, VT))
32236 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
return SDValue();
}
32241 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
32243 /// X86ISD::AVG instruction.
32244 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector() || !VT.isSimple())
return SDValue();
32249 EVT InVT = In.getValueType();
32250 unsigned NumElems = VT.getVectorNumElements();
32252 EVT ScalarVT = VT.getVectorElementType();
32253 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
isPowerOf2_32(NumElems)))
return SDValue();
32257 // InScalarVT is the intermediate type in AVG pattern and it should be greater
32258 // than the original input type (i8/i16).
32259 EVT InScalarVT = InVT.getVectorElementType();
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
return SDValue();

if (!Subtarget.hasSSE2())
return SDValue();
32265 if (Subtarget.hasBWI()) {
if (VT.getSizeInBits() > 512)
return SDValue();
32268 } else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256)
return SDValue();
} else {
if (VT.getSizeInBits() > 128)
return SDValue();
}
32276 // Detect the following pattern:
32278 // %1 = zext <N x i8> %a to <N x i32>
32279 // %2 = zext <N x i8> %b to <N x i32>
32280 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32281 // %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, <i32 1 x N>
32283 // %6 = trunc <N x i32> %5 to <N x i8>
32285 // In AVX512, the last instruction can also be a trunc store.
if (In.getOpcode() != ISD::SRL)
return SDValue();
32290 // A lambda checking the given SDValue is a constant vector and each element
32291 // is in the range [Min, Max].
32292 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32293 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || !BV->isConstant())
return false;
32296 for (SDValue Op : V->ops()) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return false;
uint64_t Val = C->getZExtValue();
if (Val < Min || Val > Max)
return false;
}
return true;
};
// Check if each element of the vector is right-shifted by one.
32308 auto LHS = In.getOperand(0);
32309 auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
return SDValue();
if (LHS.getOpcode() != ISD::ADD)
return SDValue();
32315 // Detect a pattern of a + b + 1 where the order doesn't matter.
32316 SDValue Operands[3];
32317 Operands[0] = LHS.getOperand(0);
32318 Operands[1] = LHS.getOperand(1);
32320 // Take care of the case when one of the operands is a constant vector whose
32321 // element is in the range [1, 256].
32322 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32323 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32324 Operands[0].getOperand(0).getValueType() == VT) {
32325 // The pattern is detected. Subtract one from the constant vector, then
32326 // demote it and emit X86ISD::AVG instruction.
32327 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32328 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32329 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32330 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32334 if (Operands[0].getOpcode() == ISD::ADD)
32335 std::swap(Operands[0], Operands[1]);
32336 else if (Operands[1].getOpcode() != ISD::ADD)
32338 Operands[2] = Operands[1].getOperand(0);
32339 Operands[1] = Operands[1].getOperand(1);
32341 // Now we have three operands of two additions. Check that one of them is a
32342 // constant vector with ones, and the other two are promoted from i8/i16.
32343 for (int i = 0; i < 3; ++i) {
32344 if (!IsConstVectorInRange(Operands[i], 1, 1))
32346 std::swap(Operands[i], Operands[2]);
32348 // Check if Operands[0] and Operands[1] are results of type promotion.
32349 for (int j = 0; j < 2; ++j)
32350 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32351 Operands[j].getOperand(0).getValueType() != VT)
32354 // The pattern is detected, emit X86ISD::AVG instruction.
32355 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32356 Operands[1].getOperand(0));
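
// A typical source of the pattern above (illustrative only) is a vectorized
// rounding-average loop such as:
//
//   void avg(unsigned char *c, const unsigned char *a,
//            const unsigned char *b, int n) {
//     for (int i = 0; i < n; ++i)
//       c[i] = (a[i] + b[i] + 1) / 2; // becomes PAVGB after this combine
//   }
//
// The arithmetic is done in a wider type to avoid overflow, which is exactly
// the zext/add/lshr/trunc chain matched above.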
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
  // into two 16-byte operations.
  ISD::LoadExtType Ext = Ld->getExtensionType();
  bool Fast;
  unsigned AddressSpace = Ld->getAddressSpace();
  unsigned Alignment = Ld->getAlignment();
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
      Ext == ISD::NON_EXTLOAD &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                             AddressSpace, Alignment, &Fast) && !Fast) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Ptr = Ld->getBasePtr();

    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems / 2);
    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Alignment, Ld->getMemOperand()->getFlags());

    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
    SDValue Load2 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1), Load2.getValue(1));

    SDValue NewVec = DAG.getUNDEF(RegVT);
    NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
    NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  return SDValue();
}
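
// Illustrative result of the split above (hypothetical v8f32 load on a chip
// where 32-byte unaligned accesses are slow):
//   t0 = load <8 x float>, ptr              ; one 32-byte op
// becomes
//   lo = load <4 x float>, ptr              ; 16-byte op
//   hi = load <4 x float>, ptr + 16         ; 16-byte op
//   t0 = insert128(insert128(undef, lo, 0), hi, 4)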
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
static int getOneTrueElt(SDValue V) {
  // This needs to be a build vector of booleans.
  // TODO: Checking for the i1 type matches the IR definition for the mask,
  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; eg, the x86 HW
  // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
  auto *BV = dyn_cast<BuildVectorSDNode>(V);
  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
    return -1;

  int TrueIndex = -1;
  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    const SDValue &Op = BV->getOperand(i);
    if (Op.isUndef())
      continue;
    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
    if (!ConstNode)
      return -1;
    if (ConstNode->getAPIntValue().isAllOnesValue()) {
      // If we already found a one, this is too many.
      if (TrueIndex >= 0)
        return -1;
      TrueIndex = i;
    }
  }
  return TrueIndex;
}

/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
                                         SelectionDAG &DAG, SDValue &Addr,
                                         SDValue &Index, unsigned &Alignment) {
  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
  if (TrueMaskElt < 0)
    return false;

  // Get the address of the one scalar element that is specified by the mask
  // using the appropriate offset from the base pointer.
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  Addr = MaskedOp->getBasePtr();
  if (TrueMaskElt != 0) {
    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
  }

  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
  return true;
}
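
// Worked example (hypothetical): for a v4f32 masked op with mask
// <i1 0, i1 0, i1 1, i1 0>, getOneTrueElt returns 2, so:
//   Addr      = BasePtr + 2 * 4 bytes
//   Index     = 2
//   Alignment = MinAlign(original alignment, 4)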
/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
                  Alignment, ML->getMemOperand()->getFlags());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
                               Load, VecIndex);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
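
// Sketch of the rewrite (illustrative types):
//   masked_load v4f32, mask <0,0,1,0>, passthru %p
// becomes
//   %s = load float, Addr         ; scalar load of element 2
//   insert_vector_elt %p, %s, 2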
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  unsigned NumElts = VT.getVectorNumElements();
  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
  if (LoadFirstElt && LoadLastElt) {
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                ML->getMemOperand());
    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).

  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  if (ML->getSrc0().isUndef())
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                    ML->getMask(), DAG.getUNDEF(VT),
                                    ML->getMemoryVT(), ML->getMemOperand(),
                                    ML->getExtensionType());
  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());

  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
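
// Sketch of the constant-mask rewrite above (illustrative): a masked load
// whose mask is a build_vector of constants becomes
//   %v = masked_load ..., passthru undef
//   select mask, %v, original-passthru
// so the select can be lowered to an immediate blend (e.g. vblendps) instead
// of the variable blend (vblendvps) implied by the masked load itself.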
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;
    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  if (Mld->getExtensionType() != ISD::SEXTLOAD)
    return SDValue();

  // Resolve extending loads.
  EVT VT = Mld->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

  assert(LdVT != VT && "Cannot extend to the same type");
  unsigned ToSz = VT.getScalarSizeInBits();
  unsigned FromSz = LdVT.getScalarSizeInBits();
  // From/To sizes and ElemCount must be powers of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");

  unsigned SizeRatio = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   LdVT.getScalarType(), NumElems*SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Convert Src0 value.
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
  if (!Mld->getSrc0().isUndef()) {
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
  }
  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                              SelectionDAG &DAG) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  EVT VT = MS->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                MS->getValue(), VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
                      Alignment, MS->getMemOperand()->getFlags());
}
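
// Sketch of the rewrite (illustrative types):
//   masked_store v4f32 %v, mask <0,0,1,0>
// becomes
//   %s = extract_vector_elt %v, 2
//   store float %s, Addr          ; Addr = BasePtr + 2 * 4 bytes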
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

  if (Mst->isCompressingStore())
    return SDValue();

  if (!Mst->isTruncatingStore())
    return reduceMaskedStoreToScalarStore(Mst, DAG);

  // Resolve truncating stores.
  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getScalarSizeInBits();
  unsigned ToSz = StVT.getScalarSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The truncating store is legal in some cases. For example
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
  // are designed for truncating stores.
  // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))
    return SDValue();

  // From/To sizes and ElemCount must be powers of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert(((NumElems * FromSz) % ToSz) == 0 &&
         "Unexpected ratio for truncating masked store");

  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   StVT.getScalarType(), NumElems*SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
         "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              ShuffleVec);

  SDValue NewMask;
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
      ShuffleVec[i] = NumElems*SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
                            Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
  unsigned Alignment = St->getAlignment();
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

    SDValue Ch0 =
        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                     Alignment, St->getMemOperand()->getFlags());
    SDValue Ch1 =
        DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
                     std::min(16U, Alignment), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of the
    // X86ISD::AVG instruction.
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

    if (SDValue Val =
            detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
                                    Subtarget))
      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);

    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getScalarSizeInBits();
    unsigned ToSz = StVT.getScalarSizeInBits();

    // The truncating store is legal in some cases. For example
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
    // are designed for truncating stores.
    // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
      return SDValue();

    // From/To sizes and ElemCount must be powers of two.
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

    unsigned SizeRatio = FromSz / ToSz;

    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                     StVT.getScalarType(), NumElems*SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit.
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units.
    EVT StoreVecVT =
        EVT::getVectorVT(*DAG.getContext(), StoreType,
                         VT.getSizeInBits() / StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory.
    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i, dl));
      SDValue Ch =
          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
                       St->getAlignment(), St->getMemOperand()->getFlags());
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }
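
  // Illustrative walk-through of the shuffle+store path above (hypothetical
  // v8i16 -> v8i8 truncating store on a 64-bit SSE2 target): WideVecVT is
  // v16i8, the shuffle packs the 8 truncated bytes into the low half of the
  // register, StoreType becomes i64, and a single 8-byte store is emitted
  // instead of eight 1-byte stores.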
  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode *LdVal = St->getValue().getNode();
    LoadSDNode *Ld = nullptr;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode *ChainVal = St->getChain().getNode();
    // Must be a store of a load. We currently handle two cases: the load
    // is a direct child, and it's under an intervening TokenFactor. It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget.is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getPointerInfo(), Ld->getAlignment(),
                                  Ld->getMemOperand()->getFlags());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex >= 0) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(), Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               MinAlign(Ld->getAlignment(), 4),
                               Ld->getMemOperand()->getFlags());

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex >= 0) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

    SDValue LoSt =
        DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(
        NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
        MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}
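
// Sketch of the preceding extract+store rewrite (illustrative, 32-bit target
// with SSE2):
//   %e = extract_vector_elt <2 x i64> %v, 0
//   store i64 %e, ptr
// becomes
//   %c = bitcast %v to <2 x f64>
//   %e = extract_vector_elt %c, 0
//   store f64 %e, ptr            ; a single f64 store, no GPR pair needed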
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  MVT VT = LHS.getSimpleValueType();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  unsigned HalfLaneElts = NumLaneElts/2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!LHS.getOperand(0).isUndef())
      A = LHS.getOperand(0);
    if (!LHS.getOperand(1).isUndef())
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (!LHS.isUndef())
      A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!RHS.getOperand(0).isUndef())
      C = RHS.getOperand(0);
    if (!RHS.getOperand(1).isUndef())
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (!RHS.isUndef())
      C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (A != C)
    ShuffleVectorSDNode::commuteMask(RMask);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i+l], RIdx = RMask[i+l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  bool IsFadd = N->getOpcode() == ISD::FADD;
  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, IsFadd)) {
    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }
  return SDValue();
}
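
// Illustrative input/output for the FADD case (hypothetical v4f32):
//   LHS = vector_shuffle A, B, <0, 2, 4, 6>
//   RHS = vector_shuffle A, B, <1, 3, 5, 7>
//   fadd LHS, RHS  -->  X86ISD::FHADD A, B   (HADDPS)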
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          SDLoc &DL) {
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
  SDValue Src = N->getOperand(0);
  unsigned Opcode = Src.getOpcode();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

    // Repeated operand, so we are only trading one output truncation for
    // one input truncation.
    if (Op0 == Op1)
      return true;

    // See if either operand has been extended from a smaller/equal size to
    // the truncation size, allowing a truncation to combine with the extend.
    unsigned Opcode0 = Op0.getOpcode();
    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
         Opcode0 == ISD::ZERO_EXTEND) &&
        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    unsigned Opcode1 = Op1.getOpcode();
    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
         Opcode1 == ISD::ZERO_EXTEND) &&
        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    // See if either operand is a single use constant which can be constant
    // folded.
    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
  };

  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
  };

  // Don't combine if the operation has other uses.
  if (!N->isOnlyUserOf(Src.getNode()))
    return SDValue();

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
  if (!VT.isVector())
    return SDValue();

  // In most cases it's only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
  switch (Opcode) {
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }

  case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
    // it's better to truncate if we have the chance.
    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
        !TLI.isOperationLegal(Opcode, SrcVT))
      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
    LLVM_FALLTHROUGH;
  case ISD::ADD: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }
  }

  return SDValue();
}
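
// Illustrative case for the ISD::MUL path (hypothetical types): on a target
// where the i64 vector multiply is illegal but the i32 one is legal,
//   trunc (mul v4i64 X, Y) to v4i32
// becomes
//   mul v4i32 (trunc X), (trunc Y)
// trading the expensive wide multiply for a legal narrow one plus truncations.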
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
                             Regs[0].getValueType() == MVT::v2i64));
  EVT OutVT = N->getValueType(0);
  EVT OutSVT = OutVT.getVectorElementType();
  EVT InVT = Regs[0].getValueType();
  EVT InSVT = InVT.getVectorElementType();
  SDLoc DL(N);

  // First, use mask to unset all bits that won't appear in the result.
  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
         "OutSVT can only be either i8 or i16.");
  APInt Mask =
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
  for (auto &Reg : Regs)
    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

  MVT UnpackedVT, PackedVT;
  if (OutSVT == MVT::i8) {
    UnpackedVT = MVT::v8i16;
    PackedVT = MVT::v16i8;
  } else {
    UnpackedVT = MVT::v4i32;
    PackedVT = MVT::v8i16;
  }

  // In each iteration, truncate the type by a half size.
  auto RegNum = Regs.size();
  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
       j < e; j *= 2, RegNum /= 2) {
    for (unsigned i = 0; i < RegNum; i++)
      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
    for (unsigned i = 0; i < RegNum / 2; i++)
      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
                            Regs[i * 2 + 1]);
  }

  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
  // and then extract a subvector as the result since v8i8 is not a legal type.
  if (OutVT == MVT::v8i8) {
    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
                          DAG.getIntPtrConstant(0, DL));
    return Regs[0];
  } else if (RegNum > 1) {
    Regs.resize(RegNum);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}

/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
  EVT OutVT = N->getValueType(0);
  SDLoc DL(N);

  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
  for (auto &Reg : Regs) {
    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
  }

  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
                          Regs[i * 2 + 1]);

  if (Regs.size() > 2) {
    Regs.resize(Regs.size() / 2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}
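
// Why the shift pair above: PACKSSDW saturates each signed i32 to i16, so
// every i32 lane must be the sign extension of its low 16 bits. Shifting left
// then arithmetic-right by 16 guarantees that, e.g. (illustrative):
//   0x00012345 << 16 = 0x23450000;  >> 16 (arithmetic) = 0x00002345
// so PACKSS then performs a plain truncation of each lane.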
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in fewer instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
    return SDValue();

  SDLoc DL(N);

  // Split a long vector into vectors of legal type.
  unsigned RegNum = InVT.getSizeInBits() / 128;
  SmallVector<SDValue, 8> SubVec(RegNum);
  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

  for (unsigned i = 0; i < RegNum; i++)
    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
  else if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
  else
    return SDValue();
}
/// This function transforms vector truncation of 'all or none' bits values,
/// i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32, into X86ISD::PACKSS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  MVT SVT = VT.getScalarType();

  MVT InVT = In.getValueType().getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Use PACKSS if the input is a splatted sign bit.
  // e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
  if (NumSignBits != InSVT.getSizeInBits())
    return SDValue();

  // Check we have a truncation suited for PACKSS.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();
  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
    return SDValue();

  return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
}

static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return V;

  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // Try to combine truncation with unsigned saturation.
  if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
    return Val;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 and x86mmx.
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign bits with PACKSS.
  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return V;

  return combineVectorTruncation(N, DAG, Subtarget);
}
/// Returns the negated value if the node \p N flips the sign of an FP value.
///
/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
static SDValue isFNEG(SDNode *N) {
  if (N->getOpcode() == ISD::FNEG)
    return N->getOperand(0);

  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
    return SDValue();

  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
  if (!Op1.getValueType().isFloatingPoint())
    return SDValue();

  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));

  unsigned EltBits = Op1.getScalarValueSizeInBits();
  auto isSignMask = [&](const ConstantFP *C) {
    return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
  };

  // There is more than one way to represent the same constant on
  // the different X86 targets. The type of the node may also depend on size.
  //  - load scalar value and broadcast
  //  - BUILD_VECTOR node
  //  - load from a constant pool.
  // We check all variants here.
  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
      if (isSignMask(cast<ConstantFP>(C)))
        return Op0;
  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
      if (isSignMask(CN->getConstantFPValue()))
        return Op0;
  } else if (auto *C = getTargetConstantFromNode(Op1)) {
    if (C->getType()->isVectorTy()) {
      if (auto *SplatV = C->getSplatValue())
        if (isSignMask(cast<ConstantFP>(SplatV)))
          return Op0;
    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
      if (isSignMask(FPConst))
        return Op0;
  }
  return SDValue();
}
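
// Illustrative DAG matched here (hypothetical v4f32 case):
//   (v4f32 (bitcast (xor (bitcast x to v4i32),
//                        (build_vector 0x80000000, ...))))
// The splatted 0x80000000 is the f32 sign mask, so this is FNEG(x) and the
// function returns x.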
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  EVT OrigVT = N->getValueType(0);
  SDValue Arg = isFNEG(N);
  assert(Arg.getNode() && "N is expected to be an FNEG node");

  EVT VT = Arg.getValueType();
  EVT SVT = VT.getScalarType();
  SDLoc DL(N);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // If we're negating an FMUL node on a target with FMA, then we can avoid the
  // use of a constant by performing (-0 - A*B) instead.
  // FIXME: Check rounding control flags as well once it becomes available.
  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
                                  Arg.getOperand(1), Zero);
    return DAG.getBitcast(OrigVT, NewNode);
  }

  // If we're negating an FMA node, then we can adjust the
  // instruction to include the extra negation.
  unsigned NewOpcode = 0;
  if (Arg.hasOneUse()) {
    switch (Arg.getOpcode()) {
    case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
    case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
    case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
    // We can't handle scalar intrinsic node here because it would only
    // invert one element and not the whole vector. But we could try to handle
    // a negation of the lower element only.
    }
  }
  if (NewOpcode)
    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
                                              Arg.getNode()->ops()));

  return SDValue();
}
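
// The FMA opcode swaps above follow from sign algebra, e.g. (illustrative):
//   -(FMADD a, b, c) = -(a*b + c) = -(a*b) - c = FNMSUB a, b, c
//   -(FMSUB a, b, c) = -(a*b - c) = -(a*b) + c = FNMADD a, b, c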
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  // If we have integer vector types available, use the integer opcodes.
  if (VT.isVector() && Subtarget.hasSSE2()) {
    SDLoc dl(N);

    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
    unsigned IntOpcode;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected FP logic op");
    case X86ISD::FOR: IntOpcode = ISD::OR; break;
    case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
    case X86ISD::FAND: IntOpcode = ISD::AND; break;
    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
    }
    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
    return DAG.getBitcast(VT, IntOp);
  }
  return SDValue();
}

static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
    return RV;

  if (Subtarget.hasCMov())
    if (SDValue RV = combineIntegerAbs(N, DAG))
      return RV;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (isFNEG(N))
    return combineFneg(N, DAG, Subtarget);
  return SDValue();
}
static bool isNullFPScalarOrVectorConst(SDValue V) {
  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}

/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  if (!isNullFPScalarOrVectorConst(V))
    return SDValue();

  if (V.getValueType().isVector())
    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

  return V;
}

static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
        (VT == MVT::f64 && Subtarget.hasSSE2())))
    return SDValue();

  auto isAllOnesConstantFP = [](SDValue V) {
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  // FAND(0.0, x) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
    return V;

  // FAND(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  // FANDN(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // FANDN(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
    return N->getOperand(0);

  if (isFNEG(N))
    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
      return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
  default: llvm_unreachable("unknown opcode");
  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  // TODO: Check for global or instruction-level "nnan". In that case, we
  //       should be able to lower to FMAX/FMIN alone.
  // TODO: If an operand is already known to be a NaN or not a NaN, this
  //       should be an optional swap and FMAX/FMIN.

  EVT VT = N->getValueType(0);
  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
    return SDValue();

  // This takes at least 3 instructions, so favor a library call when operating
  // on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  // Op0        ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}
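
// Worked example (illustrative): fmaxnum(Op0 = NaN, Op1 = 1.0):
//   MinOrMax = FMAX(Op1, Op0) = Op0 = NaN  (SSE returns the 2nd source here)
//   IsOp0Nan = true, so the select yields Op1 = 1.0, matching IEEE maxNum.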
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
    return N->getOperand(1);

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

  EVT VT = N->getValueType(0);

  // Attempt to recursively combine a bitmask ANDNP with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  return SDValue();
}

static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    KnownBits Known;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}
33845 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
33846 const X86Subtarget &Subtarget) {
33847 EVT VT = N->getValueType(0);
33848 if (!VT.isVector())
33851 SDValue N0 = N->getOperand(0);
33852 SDValue N1 = N->getOperand(1);
33853 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
33856 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE
33857 // and AVX2 since there is no sign-extended shift right operation on a
33858 // vector with 64-bit elements.
33859 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
33860 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
33861 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
33862 N0.getOpcode() == ISD::SIGN_EXTEND)) {
33863 SDValue N00 = N0.getOperand(0);
33865 // EXTLOAD has a better solution on AVX2: it may be replaced with an
33866 // X86ISD::VSEXT node.
33867 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
33868 if (!ISD::isNormalLoad(N00.getNode()))
33871 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
33872 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
33874 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
33880 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
33881 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
33882 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
33883 /// opportunities to combine math ops, use an LEA, or use a complex addressing
33884 /// mode. This can eliminate extend, add, and shift instructions.
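/// For example (a sketch; the exact selection depends on the users of the
/// result):
///   sext(add_nsw(x, 40)) --> add(sext(x), 40)
/// On x86-64, if the widened add then feeds an address computation, it can
/// be folded into an LEA or a complex addressing mode such as
/// 40(%rax,%rcx,4).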
33885 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
33886 const X86Subtarget &Subtarget) {
33887 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
33888 Ext->getOpcode() != ISD::ZERO_EXTEND)
33891 // TODO: This should be valid for other integer types.
33892 EVT VT = Ext->getValueType(0);
33893 if (VT != MVT::i64)
33896 SDValue Add = Ext->getOperand(0);
33897 if (Add.getOpcode() != ISD::ADD)
33900 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
33901 bool NSW = Add->getFlags().hasNoSignedWrap();
33902 bool NUW = Add->getFlags().hasNoUnsignedWrap();
33904 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
33905 // into the 'zext'.
33906 if ((Sext && !NSW) || (!Sext && !NUW))
33909 // Having a constant operand to the 'add' ensures that we are not increasing
33910 // the instruction count because the constant is extended for free below.
33911 // A constant operand can also become the displacement field of an LEA.
33912 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
33916 // Don't make the 'add' bigger if there's no hope of combining it with some
33917 // other 'add' or 'shl' instruction.
33918 // TODO: It may be profitable to generate simpler LEA instructions in place
33919 // of single 'add' instructions, but the cost model for selecting an LEA
33920 // currently has a high threshold.
33921 bool HasLEAPotential = false;
33922 for (auto *User : Ext->uses()) {
33923 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
33924 HasLEAPotential = true;
33928 if (!HasLEAPotential)
33931 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
33932 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
33933 SDValue AddOp0 = Add.getOperand(0);
33934 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
33935 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
33937 // The wider add is guaranteed to not wrap because both operands are
33940 Flags.setNoSignedWrap(NSW);
33941 Flags.setNoUnsignedWrap(NUW);
33942 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
33945 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
33946 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
33947 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
33948 /// extends from AH (which we otherwise need to do contortions to access).
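/// For example (illustrative): for (i32 sext (i8 srem %x, %y)), the 8-bit
/// idivb leaves the remainder in AH, and SDIVREM8_SEXT_HREG lets instruction
/// selection emit a single sign-extending move out of AH instead of a
/// shift/extend sequence on the full result.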
33949 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
33950 SDValue N0 = N->getOperand(0);
33951 auto OpcodeN = N->getOpcode();
33952 auto OpcodeN0 = N0.getOpcode();
33953 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
33954 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
33957 EVT VT = N->getValueType(0);
33958 EVT InVT = N0.getValueType();
33959 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
33962 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
33963 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
33964 : X86ISD::UDIVREM8_ZEXT_HREG;
33965 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
33967 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
33968 return R.getValue(1);
33971 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
33972 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating
33973 /// with UNDEFs) the input into vectors of the same size as the target
33974 /// type, which then extends the lowest elements.
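/// For example (a sketch, assuming SSE4.1 but not AVX2): extending v8i16 to
/// v8i32 extracts the two v4i16 halves, widens each to 128 bits with UNDEFs,
/// extends each in-register (selecting to pmovsxwd/pmovzxwd), and
/// concatenates the two v4i32 results back into the v8i32.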
33975 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
33976 TargetLowering::DAGCombinerInfo &DCI,
33977 const X86Subtarget &Subtarget) {
33978 unsigned Opcode = N->getOpcode();
33979 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
33981 if (!DCI.isBeforeLegalizeOps())
33983 if (!Subtarget.hasSSE2())
33986 SDValue N0 = N->getOperand(0);
33987 EVT VT = N->getValueType(0);
33988 EVT SVT = VT.getScalarType();
33989 EVT InVT = N0.getValueType();
33990 EVT InSVT = InVT.getScalarType();
33992 // Input type must be a vector and we must be extending legal integer types.
33993 if (!VT.isVector())
33995 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
33997 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
34000 // On AVX2+ targets, if the input/output types are both legal then we will be
34001 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
34002 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
34003 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
34008 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
34009 EVT InVT = N.getValueType();
34010 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
34011 Size / InVT.getScalarSizeInBits());
34012 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
34013 DAG.getUNDEF(InVT));
34015 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
34018 // If the target size is less than 128 bits, extend to a type that would
34019 // extend to 128 bits, extend that, and extract the original target vector.
34020 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
34021 unsigned Scale = 128 / VT.getSizeInBits();
34023 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
34024 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
34025 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
34026 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
34027 DAG.getIntPtrConstant(0, DL));
34030 // If the target size is 128 bits (or 256 bits on an AVX2 target), convert
34031 // to ISD::*_EXTEND_VECTOR_INREG, which ensures lowering to X86ISD::V*EXT.
34032 // Also use this if we don't have SSE41, to allow the legalizer to do its job.
34033 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
34034 (VT.is256BitVector() && Subtarget.hasInt256()) ||
34035 (VT.is512BitVector() && Subtarget.hasAVX512())) {
34036 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
34037 return Opcode == ISD::SIGN_EXTEND
34038 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
34039 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
34042 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
34043 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
34044 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
34045 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
34046 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
34048 SmallVector<SDValue, 8> Opnds;
34049 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
34050 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
34051 DAG.getIntPtrConstant(Offset, DL));
34052 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
34053 SrcVec = Opcode == ISD::SIGN_EXTEND
34054 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
34055 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
34056 Opnds.push_back(SrcVec);
34058 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
34061 // On pre-AVX2 targets, split into 128-bit nodes of
34062 // ISD::*_EXTEND_VECTOR_INREG.
34063 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
34064 return SplitAndExtendInReg(128);
34066 // On pre-AVX512 targets, split into 256-bit nodes of
34067 // ISD::*_EXTEND_VECTOR_INREG.
34068 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
34069 return SplitAndExtendInReg(256);
34074 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
34075 TargetLowering::DAGCombinerInfo &DCI,
34076 const X86Subtarget &Subtarget) {
34077 SDValue N0 = N->getOperand(0);
34078 EVT VT = N->getValueType(0);
34079 EVT InVT = N0.getValueType();
34082 if (SDValue DivRem8 = getDivRem8(N, DAG))
34085 if (!DCI.isBeforeLegalizeOps()) {
34086 if (InVT == MVT::i1) {
34087 SDValue Zero = DAG.getConstant(0, DL, VT);
34088 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
34089 return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
34094 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
34095 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
34096 // Inverting and sign-extending a boolean is the same as zero-extending
34097 // and subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
34098 // efficiently lowered with an LEA or a DEC. This is the same as:
34099 // select Bool, 0, -1. I.e. sext (xor Bool, -1) --> sub (zext Bool), 1
34100 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
34101 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
34104 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34107 if (Subtarget.hasAVX() && VT.is256BitVector())
34108 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34111 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34117 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
34118 const X86Subtarget &Subtarget) {
34120 EVT VT = N->getValueType(0);
34122 // Let legalize expand this if it isn't a legal type yet.
34123 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34126 EVT ScalarVT = VT.getScalarType();
34127 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
34130 SDValue A = N->getOperand(0);
34131 SDValue B = N->getOperand(1);
34132 SDValue C = N->getOperand(2);
34134 auto invertIfNegative = [](SDValue &V) {
34135 if (SDValue NegVal = isFNEG(V.getNode())) {
34142 // Do not convert the passthru input of scalar intrinsics.
34143 // FIXME: We could allow negations of the lower element only.
34144 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34145 bool NegB = invertIfNegative(B);
34146 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
34148 // The multiplication is negated when exactly one of NegA and NegB is set.
34149 bool NegMul = (NegA != NegB);
34151 unsigned NewOpcode;
34153 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
34155 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34158 if (N->getOpcode() == X86ISD::FMADD_RND) {
34159 switch (NewOpcode) {
34160 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
34161 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
34162 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34163 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34165 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34166 switch (NewOpcode) {
34167 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
34168 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
34169 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34170 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34172 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34173 switch (NewOpcode) {
34174 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
34175 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
34176 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34177 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
34180 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34181 "Unexpected opcode!");
34182 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34185 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
34188 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34189 TargetLowering::DAGCombinerInfo &DCI,
34190 const X86Subtarget &Subtarget) {
34191 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
34192 // (and (i32 x86isd::setcc_carry), 1)
34193 // This eliminates the zext. This transformation is necessary because
34194 // ISD::SETCC is always legalized to i8.
34196 SDValue N0 = N->getOperand(0);
34197 EVT VT = N->getValueType(0);
34199 if (N0.getOpcode() == ISD::AND &&
34201 N0.getOperand(0).hasOneUse()) {
34202 SDValue N00 = N0.getOperand(0);
34203 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34204 if (!isOneConstant(N0.getOperand(1)))
34206 return DAG.getNode(ISD::AND, dl, VT,
34207 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34208 N00.getOperand(0), N00.getOperand(1)),
34209 DAG.getConstant(1, dl, VT));
34213 if (N0.getOpcode() == ISD::TRUNCATE &&
34215 N0.getOperand(0).hasOneUse()) {
34216 SDValue N00 = N0.getOperand(0);
34217 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34218 return DAG.getNode(ISD::AND, dl, VT,
34219 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34220 N00.getOperand(0), N00.getOperand(1)),
34221 DAG.getConstant(1, dl, VT));
34225 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34228 if (VT.is256BitVector())
34229 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34232 if (SDValue DivRem8 = getDivRem8(N, DAG))
34235 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34238 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34244 /// Try to map a 128-bit or larger integer comparison to vector instructions
34245 /// before type legalization splits it up into chunks.
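/// For example (illustrative), on an SSE2 target:
///   setcc eq (i128 %x, i128 %y)
/// becomes a pcmpeqb on the 16-byte values, a pmovmskb of the result, and a
/// scalar compare of the mask against 0xFFFF, instead of several scalar
/// compares chained together after the i128 is split by type legalization.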
34246 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34247 const X86Subtarget &Subtarget) {
34248 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34249 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34251 // We're looking for an oversized integer equality comparison, but ignore a
34252 // comparison with zero because that gets special treatment in EmitTest().
34253 SDValue X = SetCC->getOperand(0);
34254 SDValue Y = SetCC->getOperand(1);
34255 EVT OpVT = X.getValueType();
34256 unsigned OpSize = OpVT.getSizeInBits();
34257 if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34260 // TODO: Use PXOR + PTEST for SSE4.1 or later?
34261 // TODO: Add support for AVX-512.
34262 EVT VT = SetCC->getValueType(0);
34264 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34265 (OpSize == 256 && Subtarget.hasAVX2())) {
34266 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34267 SDValue VecX = DAG.getBitcast(VecVT, X);
34268 SDValue VecY = DAG.getBitcast(VecVT, Y);
34270 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34271 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34272 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34273 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34274 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34275 SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34276 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
34277 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34279 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
34285 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34286 const X86Subtarget &Subtarget) {
34287 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34288 SDValue LHS = N->getOperand(0);
34289 SDValue RHS = N->getOperand(1);
34290 EVT VT = N->getValueType(0);
34293 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34294 EVT OpVT = LHS.getValueType();
34295 // 0-x == y --> x+y == 0
34296 // 0-x != y --> x+y != 0
34297 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34299 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34300 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34302 // x == 0-y --> x+y == 0
34303 // x != 0-y --> x+y != 0
34304 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34306 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34307 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34310 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
34314 if (VT.getScalarType() == MVT::i1 &&
34315 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
34317 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34318 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34319 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34321 if (!IsSEXT0 || !IsVZero1) {
34322 // Swap the operands and update the condition code.
34323 std::swap(LHS, RHS);
34324 CC = ISD::getSetCCSwappedOperands(CC);
34326 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34327 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34328 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34331 if (IsSEXT0 && IsVZero1) {
34332 assert(VT == LHS.getOperand(0).getValueType() &&
34333 "Uexpected operand type");
34334 if (CC == ISD::SETGT)
34335 return DAG.getConstant(0, DL, VT);
34336 if (CC == ISD::SETLE)
34337 return DAG.getConstant(1, DL, VT);
34338 if (CC == ISD::SETEQ || CC == ISD::SETGE)
34339 return DAG.getNOT(DL, LHS.getOperand(0), VT);
34341 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34342 "Unexpected condition code!");
34343 return LHS.getOperand(0);
34347 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
34348 // to avoid scalarization via legalization because v4i32 is not a legal type.
34349 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
34350 LHS.getValueType() == MVT::v4f32)
34351 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
34356 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
34358 // Gather and Scatter instructions use k-registers for masks. The type of
34359 // the masks is v*i1, so the mask will be truncated anyway.
34360 // The SIGN_EXTEND_INREG may be dropped.
34361 SDValue Mask = N->getOperand(2);
34362 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
34363 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
34364 NewOps[2] = Mask.getOperand(0);
34365 DAG.UpdateNodeOperands(N, NewOps);
34370 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
34371 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
34372 const X86Subtarget &Subtarget) {
34374 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
34375 SDValue EFLAGS = N->getOperand(1);
34377 // Try to simplify the EFLAGS and condition code operands.
34378 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
34379 return getSETCC(CC, Flags, DL, DAG);
34384 /// Optimize branch condition evaluation.
34385 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
34386 const X86Subtarget &Subtarget) {
34388 SDValue EFLAGS = N->getOperand(3);
34389 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
34391 // Try to simplify the EFLAGS and condition code operands.
34392 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
34393 // RAUW them under us.
34394 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
34395 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
34396 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
34397 N->getOperand(1), Cond, Flags);
34403 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
34404 SelectionDAG &DAG) {
34405 // Take advantage of vector comparisons producing 0 or -1 in each lane to
34406 // optimize away operation when it's from a constant.
34408 // The general transformation is:
34409 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
34410 // AND(VECTOR_CMP(x,y), constant2)
34411 // constant2 = UNARYOP(constant)
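// For example (a sketch with UNARYOP = sint_to_fp):
//   (v4f32 sint_to_fp (and (setcc ...), <i32 1,1,1,1>))
//     --> (v4f32 bitcast (and (setcc ...), (bitcast <float 1.0 x4>)))
// Each lane of the AND is either all-zeros or the constant, and
// UNARYOP(0) == 0 holds here, so folding the UNARYOP into the constant is
// safe.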
34413 // Early exit if this isn't a vector operation, the operand of the
34414 // unary operation isn't a bitwise AND, or if the sizes of the operations
34415 // aren't the same.
34416 EVT VT = N->getValueType(0);
34417 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
34418 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
34419 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
34422 // Now check that the other operand of the AND is a constant. We could
34423 // make the transformation for non-constant splats as well, but it's unclear
34424 // that would be a benefit as it would not eliminate any operations, just
34425 // perform one more step in scalar code before moving to the vector unit.
34426 if (BuildVectorSDNode *BV =
34427 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
34428 // Bail out if the vector isn't a constant.
34429 if (!BV->isConstant())
34432 // Everything checks out. Build up the new and improved node.
34434 EVT IntVT = BV->getValueType(0);
34435 // Create a new constant of the appropriate type for the transformed
34436 // DAG node.
34437 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
34438 // The AND node needs bitcasts to/from an integer vector type around it.
34439 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
34440 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
34441 N->getOperand(0)->getOperand(0), MaskConst);
34442 SDValue Res = DAG.getBitcast(VT, NewAnd);
34449 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
34450 const X86Subtarget &Subtarget) {
34451 SDValue Op0 = N->getOperand(0);
34452 EVT VT = N->getValueType(0);
34453 EVT InVT = Op0.getValueType();
34454 EVT InSVT = InVT.getScalarType();
34455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34457 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
34458 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
34459 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
34461 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34462 InVT.getVectorNumElements());
34463 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
34465 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
34466 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
34468 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34471 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
34472 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
34473 // the optimization here.
34474 if (DAG.SignBitIsZero(Op0))
34475 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
34480 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
34481 const X86Subtarget &Subtarget) {
34482 // First try to optimize away the conversion entirely when it's
34483 // conditionally from a constant. Vectors only.
34484 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
34487 // Now move on to more general possibilities.
34488 SDValue Op0 = N->getOperand(0);
34489 EVT VT = N->getValueType(0);
34490 EVT InVT = Op0.getValueType();
34491 EVT InSVT = InVT.getScalarType();
34493 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
34494 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
34495 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
34496 if (InVT.isVector() &&
34497 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
34498 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
34500 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34501 InVT.getVectorNumElements());
34502 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
34503 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34506 // Without AVX512DQ we only support i64 to float scalar conversion. For both
34507 // vectors and scalars, see if we know that the upper bits are all the sign
34508 // bit, in which case we can truncate the input to i32 and convert from that.
34509 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
34510 unsigned BitWidth = InVT.getScalarSizeInBits();
34511 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
34512 if (NumSignBits >= (BitWidth - 31)) {
34513 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
34514 if (InVT.isVector())
34515 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
34516 InVT.getVectorNumElements());
34518 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
34519 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
34523 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
34524 // a 32-bit target where SSE doesn't support i64->FP operations.
34525 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
34526 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
34527 EVT LdVT = Ld->getValueType(0);
34529 // This transformation is not supported if the result type is f16 or f128.
34530 if (VT == MVT::f16 || VT == MVT::f128)
34533 if (!Ld->isVolatile() && !VT.isVector() &&
34534 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
34535 !Subtarget.is64Bit() && LdVT == MVT::i64) {
34536 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
34537 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
34538 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
34545 // Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
34546 static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
34547 X86TargetLowering::DAGCombinerInfo &DCI) {
34548 // When legalizing carry, we create carries via "add X, -1".
34549 // If that comes from an actual carry, via setcc, we use the
34550 // carry directly.
34551 if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) {
34552 SDValue Carry = N->getOperand(0);
34553 while (Carry.getOpcode() == ISD::TRUNCATE ||
34554 Carry.getOpcode() == ISD::ZERO_EXTEND ||
34555 Carry.getOpcode() == ISD::SIGN_EXTEND ||
34556 Carry.getOpcode() == ISD::ANY_EXTEND ||
34557 (Carry.getOpcode() == ISD::AND &&
34558 isOneConstant(Carry.getOperand(1))))
34559 Carry = Carry.getOperand(0);
34561 if (Carry.getOpcode() == ISD::SETCC ||
34562 Carry.getOpcode() == X86ISD::SETCC ||
34563 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
34564 if (Carry.getConstantOperandVal(0) == X86::COND_B)
34565 return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
34572 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
34573 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
34574 X86TargetLowering::DAGCombinerInfo &DCI) {
34575 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
34576 // the result is either zero or one (depending on the input carry bit).
34577 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
34578 if (X86::isZeroNode(N->getOperand(0)) &&
34579 X86::isZeroNode(N->getOperand(1)) &&
34580 // We don't have a good way to replace an EFLAGS use, so only do this when
34582 SDValue(N, 1).use_empty()) {
34584 EVT VT = N->getValueType(0);
34585 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
34586 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
34587 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
34588 DAG.getConstant(X86::COND_B, DL,
34591 DAG.getConstant(1, DL, VT));
34592 return DCI.CombineTo(N, Res1, CarryOut);
34598 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
34599 /// which is more useful than 0/1 in some cases.
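/// For example (illustrative): "sbb %eax, %eax" computes EAX - EAX - CF,
/// i.e. 0 or -1 (all-ones) depending on the carry flag, so the result can
/// serve directly as a mask without a separate setb + negate sequence.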
34600 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
34602 // "Condition code B" is also known as "the carry flag" (CF).
34603 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
34604 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
34605 MVT VT = N->getSimpleValueType(0);
34607 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
34609 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
34610 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
34613 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
34614 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
34615 /// with CMP+{ADC, SBB}.
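/// For example (a sketch; registers are illustrative):
///   %r = add i32 %x, (zext (icmp ult i32 %a, %b))
/// can be emitted as a cmp that sets CF followed by "adcl $0, %eax", rather
/// than cmp + setb + movzbl + add.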
34616 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
34617 bool IsSub = N->getOpcode() == ISD::SUB;
34618 SDValue X = N->getOperand(0);
34619 SDValue Y = N->getOperand(1);
34621 // If this is an add, canonicalize a zext operand to the RHS.
34622 // TODO: Incomplete? What if both sides are zexts?
34623 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
34624 Y.getOpcode() != ISD::ZERO_EXTEND)
34627 // Look through a one-use zext.
34628 bool PeekedThroughZext = false;
34629 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
34630 Y = Y.getOperand(0);
34631 PeekedThroughZext = true;
34634 // If this is an add, canonicalize a setcc operand to the RHS.
34635 // TODO: Incomplete? What if both sides are setcc?
34636 // TODO: Should we allow peeking through a zext of the other operand?
34637 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
34638 Y.getOpcode() != X86ISD::SETCC)
34641 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
34645 EVT VT = N->getValueType(0);
34646 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
34648 if (CC == X86::COND_B) {
34649 // X + SETB Z --> X + (mask SBB Z, Z)
34650 // X - SETB Z --> X - (mask SBB Z, Z)
34651 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
34652 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
34653 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
34654 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
34655 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
34658 if (CC == X86::COND_A) {
34659 SDValue EFLAGS = Y->getOperand(1);
34660 // Try to convert COND_A into COND_B in an attempt to facilitate
34661 // materializing "setb reg".
34663 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
34664 // cannot take an immediate as its first operand.
34666 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
34667 EFLAGS.getValueType().isInteger() &&
34668 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
34669 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
34670 EFLAGS.getNode()->getVTList(),
34671 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
34672 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
34673 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
34674 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
34675 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
34676 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
34680 if (CC != X86::COND_E && CC != X86::COND_NE)
34683 SDValue Cmp = Y.getOperand(1);
34684 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
34685 !X86::isZeroNode(Cmp.getOperand(1)) ||
34686 !Cmp.getOperand(0).getValueType().isInteger())
34689 // (cmp Z, 1) sets the carry flag if Z is 0.
34690 SDValue Z = Cmp.getOperand(0);
34691 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
34692 DAG.getConstant(1, DL, Z.getValueType()));
34694 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
34696 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
34697 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
34698 if (CC == X86::COND_NE)
34699 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
34700 DAG.getConstant(-1ULL, DL, VT), NewCmp);
34702 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
34703 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
34704 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
34705 DAG.getConstant(0, DL, VT), NewCmp);
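// The shape combineLoopMAddPattern matches below is a vector-reduction add
// whose other operand is a mul of sign-extended i16 values (a minimal
// sketch; types illustrative):
//   %m   = mul <8 x i32> (sext <8 x i16> %a), (sext <8 x i16> %b)
//   %sum = add <8 x i32> %m, %phi      ; flagged as a vector reduction
// VPMADDWD multiplies adjacent i16 pairs and sums each pair into one i32
// lane, so the mul operands can be shrunk back to i16 and fed to it.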
34708 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
34709 const X86Subtarget &Subtarget) {
34710 SDValue MulOp = N->getOperand(0);
34711 SDValue Phi = N->getOperand(1);
34713 if (MulOp.getOpcode() != ISD::MUL)
34714 std::swap(MulOp, Phi);
34715 if (MulOp.getOpcode() != ISD::MUL)
34719 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
34722 EVT VT = N->getValueType(0);
34724 unsigned RegSize = 128;
34725 if (Subtarget.hasBWI())
34727 else if (Subtarget.hasAVX2())
34729 unsigned VectorSize = VT.getVectorNumElements() * 16;
34730 // If the vector size is less than 128 bits, or greater than the supported
34731 // RegSize, do not use PMADD.
34732 if (VectorSize < 128 || VectorSize > RegSize)
34736 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
34737 VT.getVectorNumElements());
34738 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34739 VT.getVectorNumElements() / 2);
34741 // Shrink the operands of mul.
34742 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
34743 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
34745 // The Madd vector size is half of the original vector size.
34746 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
34747 // Fill the rest of the output with zeroes.
34748 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
34749 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
34750 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
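// The shape combineLoopSADPattern matches below is a reduction add of an
// absolute-difference vselect fed by zero-extended i8 values (a minimal
// sketch; types illustrative):
//   %d   = sub <N x i32> (zext <N x i8> %a), (zext <N x i8> %b)
//   %ad  = vselect (setgt %d, -1), %d, (sub 0, %d)   ; |a - b| per lane
//   %sum = add <N x i32> %ad, %phi                   ; vector reduction
// PSADBW computes sums of absolute byte differences directly, producing one
// i64 total per group of eight bytes.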
34753 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
34754 const X86Subtarget &Subtarget) {
34756 EVT VT = N->getValueType(0);
34757 SDValue Op0 = N->getOperand(0);
34758 SDValue Op1 = N->getOperand(1);
34760 // TODO: There's nothing special about i32, any integer type above i16 should
34761 // work just as well.
34762 if (!VT.isVector() || !VT.isSimple() ||
34763 VT.getVectorElementType() != MVT::i32)
34766 unsigned RegSize = 128;
34767 if (Subtarget.hasBWI())
34769 else if (Subtarget.hasAVX2())
34772 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
34773 // TODO: We should be able to handle larger vectors by splitting them before
34774 // feeding them into several SADs, and then reducing over those.
34775 if (VT.getSizeInBits() / 4 > RegSize)
34778 // We know N is a reduction add, which means one of its operands is a phi.
34779 // To match SAD, we need the other operand to be a vector select.
34780 SDValue SelectOp, Phi;
34781 if (Op0.getOpcode() == ISD::VSELECT) {
34784 } else if (Op1.getOpcode() == ISD::VSELECT) {
34790 // Check whether we have an abs-diff pattern feeding into the select.
34791 if (!detectZextAbsDiff(SelectOp, Op0, Op1))
34794 // SAD pattern detected. Now build a SAD instruction and an addition for
34795 // reduction. Note that the number of elements of the result of SAD is less
34796 // than the number of elements of its input. Therefore, we can only update
34797 // part of the elements in the reduction vector.
34798 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
34800 // The output of PSADBW is a vector of i64.
34801 // We need to turn the vector of i64 into a vector of i32.
34802 // If the reduction vector is at least as wide as the psadbw result, just
34803 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
34804 // anyway.
34805 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
34806 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
34807 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
34809 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
34811 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
34812 // Update part of the elements of the reduction vector. This is done by
34813 // first extracting a sub-vector from it, updating this sub-vector, and
34814 // inserting the sub-vector back into the reduction vector.
34815 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
34816 DAG.getIntPtrConstant(0, DL));
34817 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
34818 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
34819 DAG.getIntPtrConstant(0, DL));
34821 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
34824 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
34825 const X86Subtarget &Subtarget) {
34826 const SDNodeFlags Flags = N->getFlags();
34827 if (Flags.hasVectorReduction()) {
34828 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
34830 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
34833 EVT VT = N->getValueType(0);
34834 SDValue Op0 = N->getOperand(0);
34835 SDValue Op1 = N->getOperand(1);
34837 // Try to synthesize horizontal adds from adds of shuffles.
34838 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
34839 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
34840 isHorizontalBinOp(Op0, Op1, true))
34841 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
34843 return combineAddOrSubToADCOrSBB(N, DAG);
34846 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
34847 const X86Subtarget &Subtarget) {
34848 SDValue Op0 = N->getOperand(0);
34849 SDValue Op1 = N->getOperand(1);
34851 // X86 can't encode an immediate LHS of a sub. See if we can push the
34852 // negation into a preceding instruction.
34853 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
34854 // If the RHS of the sub is a XOR with one use and a constant, invert the
34855 // immediate. Then add one to the LHS of the sub so we can turn
34856 // X-Y -> X+~Y+1, saving one register.
34857 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
34858 isa<ConstantSDNode>(Op1.getOperand(1))) {
34859 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
34860 EVT VT = Op0.getValueType();
34861 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
34863 DAG.getConstant(~XorC, SDLoc(Op1), VT));
34864 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
34865 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
34869 // Try to synthesize horizontal subs from subs of shuffles.
34870 EVT VT = N->getValueType(0);
34871 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
34872 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
34873 isHorizontalBinOp(Op0, Op1, false))
34874 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
34876 return combineAddOrSubToADCOrSBB(N, DAG);
34879 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
34880 TargetLowering::DAGCombinerInfo &DCI,
34881 const X86Subtarget &Subtarget) {
34882 if (DCI.isBeforeLegalize())
34886 unsigned Opcode = N->getOpcode();
34887 MVT VT = N->getSimpleValueType(0);
34888 MVT SVT = VT.getVectorElementType();
34889 unsigned NumElts = VT.getVectorNumElements();
34890 unsigned EltSizeInBits = SVT.getSizeInBits();
34892 SDValue Op = N->getOperand(0);
34893 MVT OpVT = Op.getSimpleValueType();
34894 MVT OpEltVT = OpVT.getVectorElementType();
34895 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
34896 unsigned InputBits = OpEltSizeInBits * NumElts;
34898 // Perform any constant folding.
34899 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
34901 SmallVector<APInt, 64> EltBits;
34902 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
34903 APInt Undefs(NumElts, 0);
34904 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
34906 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
34907 for (unsigned i = 0; i != NumElts; ++i) {
34908 if (UndefElts[i]) {
34912 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
34913 : EltBits[i].sextOrTrunc(EltSizeInBits);
34915 return getConstVector(Vals, Undefs, VT, DAG, DL);
34918 // (vzext (bitcast (vzext x))) -> (vzext x)
34919 // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
34920 SDValue V = peekThroughBitcasts(Op);
34921 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
34922 MVT InnerVT = V.getSimpleValueType();
34923 MVT InnerEltVT = InnerVT.getVectorElementType();
34925 // If the element sizes match exactly, we can just do one larger vzext. This
34926 // is always an exact type match as vzext operates on integer types.
34927 if (OpEltVT == InnerEltVT) {
34928 assert(OpVT == InnerVT && "Types must match for vzext!");
34929 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
34932 // The only other way we can combine them is if only a single element of the
34933 // inner vzext is used in the input to the outer vzext.
34934 if (InnerEltVT.getSizeInBits() < InputBits)
34937 // In this case, the inner vzext is completely dead because we're going to
34938 // only look at bits inside of the low element. Just do the outer vzext on
34939 // a bitcast of the input to the inner.
34940 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
34943 // Check if we can bypass extracting and re-inserting an element of an input
34944 // vector. Essentially:
34945 // (bitcast (scalar_to_vector (extract_vector_elt x))) -> (bitcast x)
34946 // TODO: Add X86ISD::VSEXT support
34947 if (Opcode == X86ISD::VZEXT &&
34948 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
34949 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34950 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
34951 SDValue ExtractedV = V.getOperand(0);
34952 SDValue OrigV = ExtractedV.getOperand(0);
34953 if (isNullConstant(ExtractedV.getOperand(1))) {
34954 MVT OrigVT = OrigV.getSimpleValueType();
34955 // Extract a subvector if necessary...
34956 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
34957 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
34958 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
34959 OrigVT.getVectorNumElements() / Ratio);
34960 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
34961 DAG.getIntPtrConstant(0, DL));
34963 Op = DAG.getBitcast(OpVT, OrigV);
34964 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
34971 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
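/// e.g. (illustrative) a one-use atomic decrement "lock subl $1, (%rdi)" is
/// re-emitted as "lock addl $-1, (%rdi)"; keeping a single canonical node
/// means later pattern matching only has to handle the LADD form.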
34972 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
34973 const X86Subtarget &Subtarget) {
34974 SDValue Chain = N->getOperand(0);
34975 SDValue LHS = N->getOperand(1);
34976 SDValue RHS = N->getOperand(2);
34977 MVT VT = RHS.getSimpleValueType();
34980 auto *C = dyn_cast<ConstantSDNode>(RHS);
34981 if (!C || C->getZExtValue() != 1)
34984 RHS = DAG.getConstant(-1, DL, VT);
34985 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
34986 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
34987 DAG.getVTList(MVT::i32, MVT::Other),
34988 {Chain, LHS, RHS}, VT, MMO);
34991 // TEST (AND a, b), (AND a, b) -> TEST a, b
34992 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
34993 SDValue Op0 = N->getOperand(0);
34994 SDValue Op1 = N->getOperand(1);
34996 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
34999 EVT VT = N->getValueType(0);
35002 return DAG.getNode(X86ISD::TESTM, DL, VT,
35003 Op0->getOperand(0), Op0->getOperand(1));
35006 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
35007 const X86Subtarget &Subtarget) {
35008 MVT VT = N->getSimpleValueType(0);
35011 if (N->getOperand(0) == N->getOperand(1)) {
35012 if (N->getOpcode() == X86ISD::PCMPEQ)
35013 return getOnesVector(VT, DAG, DL);
35014 if (N->getOpcode() == X86ISD::PCMPGT)
35015 return getZeroVector(VT, Subtarget, DAG, DL);
35021 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
35022 TargetLowering::DAGCombinerInfo &DCI,
35023 const X86Subtarget &Subtarget) {
35024 if (DCI.isBeforeLegalizeOps())
35028 SDValue Vec = N->getOperand(0);
35029 SDValue SubVec = N->getOperand(1);
35030 SDValue Idx = N->getOperand(2);
35032 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
35033 MVT OpVT = N->getSimpleValueType(0);
35034 MVT SubVecVT = SubVec.getSimpleValueType();
35036 // If this is an insert of an extract, combine to a shuffle. Don't do this
35037 // if the insert or extract can be represented with a subvector operation.
35038 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
35039 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
35040 (IdxVal != 0 || !Vec.isUndef())) {
35041 int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
35042 if (ExtIdxVal != 0) {
35043 int VecNumElts = OpVT.getVectorNumElements();
35044 int SubVecNumElts = SubVecVT.getVectorNumElements();
35045 SmallVector<int, 64> Mask(VecNumElts);
35046 // First create an identity shuffle mask.
35047 for (int i = 0; i != VecNumElts; ++i)
35049 // Now insert the extracted portion.
35050 for (int i = 0; i != SubVecNumElts; ++i)
35051 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
35053 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
35057 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
35059 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35060 // (load16 addr + 16), Elts/2)
35063 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35064 // (load32 addr + 32), Elts/2)
35066 // or a 16-byte or 32-byte broadcast:
35067 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35068 // (load16 addr), Elts/2)
35069 // --> X86SubVBroadcast(load16 addr)
35071 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35072 // (load32 addr), Elts/2)
35073 // --> X86SubVBroadcast(load32 addr)
35074 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
35075 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
35076 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
35077 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
35078 if (Idx2 && Idx2->getZExtValue() == 0) {
35079 SDValue SubVec2 = Vec.getOperand(1);
35080 // If needed, look through bitcasts to get to the load.
35081 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
35083 unsigned Alignment = FirstLd->getAlignment();
35084 unsigned AS = FirstLd->getAddressSpace();
35085 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
35086 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
35087 OpVT, AS, Alignment, &Fast) && Fast) {
35088 SDValue Ops[] = {SubVec2, SubVec};
35089 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
35093 // If lower/upper loads are the same and the only users of the load, then
35094 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
35095 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
35096 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
35097 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
35098 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
35101 // If this is a subv_broadcast inserted into both halves, use a larger
35102 // subv_broadcast.
35103 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
35104 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
35105 SubVec.getOperand(0));
35114 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
35115 DAGCombinerInfo &DCI) const {
35116 SelectionDAG &DAG = DCI.DAG;
35117 switch (N->getOpcode()) {
35119 case ISD::EXTRACT_VECTOR_ELT:
35120 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
35121 case X86ISD::PEXTRW:
35122 case X86ISD::PEXTRB:
35123 return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
35124 case ISD::INSERT_SUBVECTOR:
35125 return combineInsertSubvector(N, DAG, DCI, Subtarget);
35128 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
35129 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
35130 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
35131 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
35132 case ISD::SUB: return combineSub(N, DAG, Subtarget);
35133 case X86ISD::ADD: return combineX86ADD(N, DAG, DCI);
35134 case X86ISD::ADC: return combineADC(N, DAG, DCI);
35135 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
35138 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
35139 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
35140 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
35141 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
35142 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
35143 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
35144 case ISD::STORE: return combineStore(N, DAG, Subtarget);
35145 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
35146 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
35147 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
35149 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
35150 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
35151 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
35152 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
35153 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
35154 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
35156 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
35158 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
35160 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
35161 case X86ISD::BT: return combineBT(N, DAG, DCI);
35162 case ISD::ANY_EXTEND:
35163 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
35164 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
35165 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
35166 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
35167 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
35168 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
35169 case X86ISD::VSHLI:
35170 case X86ISD::VSRAI:
35171 case X86ISD::VSRLI:
35172 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
35173 case ISD::SIGN_EXTEND_VECTOR_INREG:
35174 case ISD::ZERO_EXTEND_VECTOR_INREG:
35175 case X86ISD::VSEXT:
35176 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
35177 case X86ISD::PINSRB:
35178 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
35179 case X86ISD::SHUFP: // Handle all target specific shuffles
35180 case X86ISD::INSERTPS:
35181 case X86ISD::PALIGNR:
35182 case X86ISD::VSHLDQ:
35183 case X86ISD::VSRLDQ:
35184 case X86ISD::BLENDI:
35185 case X86ISD::UNPCKH:
35186 case X86ISD::UNPCKL:
35187 case X86ISD::MOVHLPS:
35188 case X86ISD::MOVLHPS:
35189 case X86ISD::PSHUFB:
35190 case X86ISD::PSHUFD:
35191 case X86ISD::PSHUFHW:
35192 case X86ISD::PSHUFLW:
35193 case X86ISD::MOVSHDUP:
35194 case X86ISD::MOVSLDUP:
35195 case X86ISD::MOVDDUP:
35196 case X86ISD::MOVSS:
35197 case X86ISD::MOVSD:
35198 case X86ISD::VPPERM:
35199 case X86ISD::VPERMI:
35200 case X86ISD::VPERMV:
35201 case X86ISD::VPERMV3:
35202 case X86ISD::VPERMIV3:
35203 case X86ISD::VPERMIL2:
35204 case X86ISD::VPERMILPI:
35205 case X86ISD::VPERMILPV:
35206 case X86ISD::VPERM2X128:
35207 case X86ISD::VZEXT_MOVL:
35208 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
35209 case X86ISD::FMADD:
35210 case X86ISD::FMADD_RND:
35211 case X86ISD::FMADDS1_RND:
35212 case X86ISD::FMADDS3_RND:
35213 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
35215 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
35216 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
35217 case X86ISD::TESTM: return combineTestM(N, DAG);
35218 case X86ISD::PCMPEQ:
35219 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
35225 /// Return true if the target has native support for the specified value type
35226 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
35227 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
35228 /// some i16 instructions are slow.
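/// For example (illustrative): "addw $1, %ax" needs a 66H operand-size
/// prefix, and writing AX can cause partial-register stalls on some
/// microarchitectures, so promoting the operation to i32 ("addl") is
/// usually preferable.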
35229 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
35230 if (!isTypeLegal(VT))
35232 if (VT != MVT::i16)
35239 case ISD::SIGN_EXTEND:
35240 case ISD::ZERO_EXTEND:
35241 case ISD::ANY_EXTEND:
35254 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
35255 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
35256 /// we don't adjust the stack we clobber the first frame index.
35257 /// See X86InstrInfo::copyPhysReg.
35258 static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
35259 const MachineRegisterInfo &MRI = MF.getRegInfo();
35260 return any_of(MRI.reg_instructions(X86::EFLAGS),
35261 [](const MachineInstr &RI) { return RI.isCopy(); });
35264 void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
35265 if (hasCopyImplyingStackAdjustment(MF)) {
35266 MachineFrameInfo &MFI = MF.getFrameInfo();
35267 MFI.setHasCopyImplyingStackAdjustment(true);
35270 TargetLoweringBase::finalizeLowering(MF);
35273 /// This method queries the target whether it is beneficial for the DAG
35274 /// combiner to promote the specified node. If true, it should return the
35275 /// desired promotion type by reference.
35276 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
35277 EVT VT = Op.getValueType();
35278 if (VT != MVT::i16)
35281 bool Promote = false;
35282 bool Commute = false;
35283 switch (Op.getOpcode()) {
35285 case ISD::SIGN_EXTEND:
35286 case ISD::ZERO_EXTEND:
35287 case ISD::ANY_EXTEND:
35292 SDValue N0 = Op.getOperand(0);
35293 // Look out for (store (shl (load), x)).
35294 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
35307 SDValue N0 = Op.getOperand(0);
35308 SDValue N1 = Op.getOperand(1);
35309 if (!Commute && MayFoldLoad(N1))
35311 // Avoid disabling potential load folding opportunities.
35312 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
35314 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
35324 //===----------------------------------------------------------------------===//
35325 // X86 Inline Assembly Support
35326 //===----------------------------------------------------------------------===//
35328 // Helper to match an asm string against pieces separated by whitespace.
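// For example, matchAsm("bswap %eax", {"bswap", "%eax"}) returns true; any
// run of spaces or tabs between the pieces is accepted.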
35329 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
35330 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
35332 for (StringRef Piece : Pieces) {
35333 if (!S.startswith(Piece)) // Check if the piece matches.
35336 S = S.substr(Piece.size());
35337 StringRef::size_type Pos = S.find_first_not_of(" \t");
35338 if (Pos == 0) // We matched a prefix.
35347 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
35349 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
35350 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
35351 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
35352 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
35354 if (AsmPieces.size() == 3)
35356 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
35363 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
35364 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
35366 const std::string &AsmStr = IA->getAsmString();
35368 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
35369 if (!Ty || Ty->getBitWidth() % 16 != 0)
35372 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
35373 SmallVector<StringRef, 4> AsmPieces;
35374 SplitString(AsmStr, AsmPieces, ";\n");
35376 switch (AsmPieces.size()) {
35377 default: return false;
35379 // FIXME: this should verify that we are targeting a 486 or better. If not,
35380 // we will turn this bswap into something that will be lowered to logical
35381 // ops instead of emitting the bswap asm. For now, we don't support 486 or
35382 // lower so don't worry about this.
35384 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
35385 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
35386 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
35387 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
35388 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
35389 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
35390 // No need to check constraints, nothing other than the equivalent of
35391 // "=r,0" would be valid here.
35392 return IntrinsicLowering::LowerToByteSwap(CI);
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
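  // The three-piece variant below is the 32-bit analogue: each rorw $8 swaps
  // one byte pair (e.g. %ax: 0x1234 -> 0x3412) and the rorl $16 swaps the two
  // halves, which together byte-swap the full dword.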
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
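        // With the "A" constraint the i64 value is pinned to the EDX:EAX
        // pair, so byte-swapping each 32-bit half and then exchanging the
        // halves amounts to a full 64-bit byte swap.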
        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'v':
    case 'Y':
    case 'l':
      return C_RegisterClass;
    case 'k': // AVX512 masking registers.
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'k':
        return C_Register;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
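/// For example, given the two-alternative constraint "rI" and a constant
/// operand 17, 'I' (an immediate in [0,31]) scores CW_Constant while 'r'
/// scores only CW_Register, so the immediate alternative is selected.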
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y':
    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
    if (constraint[1] == 'k') {
      // Support for 'Yk' (similarly to the 'k' variant below).
      weight = CW_SpecificReg;
      break;
    }
    // Else fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    weight = CW_SpecificReg;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
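// For example (hypothetical user code), asm("" : "=X"(d) : "X"(s)) with
// 'double' operands is rewritten to use "Y" when SSE2 is available, keeping
// the values in XMM registers instead of on the x87 stack.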
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }
    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }
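    // For example, (GA), (add GA, 8), and (sub (add GA, 8), 2) all fold into
    // a single TargetGlobalAddress with Offset 0, 8, and 6 respectively.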
    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
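// For example, GR32_ABCD reports true from isGRClass because GR32 is one of
// its superclasses, while FR32X and VR256 land in isFRClass instead.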
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q': // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r': // GENERAL_REGS
    case 'l': // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R': // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f': // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y': // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y': // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }
  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass *> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }
    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }
    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }
  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res; // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64-bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegen'd file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
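  // The cost returned below is therefore 0 for a plain (reg1) mode, 1 as soon
  // as a scaled second register is involved, and -1 if the addressing mode is
  // not legal at all.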
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
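// For example, with minsize an i32 udiv by a constant keeps the single div
// instruction, but a v4i32 udiv still returns false here, so the
// multiply-based expansion, which stays in vector form, can be used instead.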
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
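// Sketch of the mechanism (split CSR is used for the CXX_FAST_TLS calling
// convention): rather than spilling callee-saved registers to stack slots in
// the prologue, each CSR is copied into a virtual register in the entry block
// and copied back just before the terminator of every exit block.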
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}