//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// emitting code.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
}

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
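  // (addBypassSlowDiv(N, M) makes CodeGenPrepare emit a run-time check and use
  // the cheaper M-bit divide when an N-bit divide's operands fit in M bits.)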
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);
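
  // There is no instruction that loads and sign-extends a single bit, so all
  // i1 sign-extending loads are promoted.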
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
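  // (ucomiss/ucomisd set ZF for both "equal" and "unordered", so an
  // ordered-equal test also needs a parity-flag check.)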

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
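
  // FLT_ROUNDS_ is custom-lowered to a read of the FP control word to recover
  // the current rounding mode.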
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }
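
  // READCYCLECOUNTER is custom-lowered to RDTSC.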
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented; please don't build your own
  // exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }
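
  // CMPXCHG16B gives us a native 128-bit compare-and-swap to lower to.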
  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
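  // In 64-bit mode va_list is generally a structure, so VAARG and VACOPY need
  // custom lowering; the 32-bit va_list is a plain pointer that the default
  // expansion handles.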
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (!Subtarget.useSoftFloat()) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // it alone.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
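
    // SSE2 has no direct instructions for the Custom multiplies below; they
    // are emulated (e.g. v2i64 multiply via PMULUDQ partial products).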
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
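    // SSSE3 provides PABSB/PABSW/PABSD for vector integer abs.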
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
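    // SSE4.1 adds ROUNDPS/ROUNDPD/ROUNDSS/ROUNDSD, making these rounding
    // operations legal.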
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

    // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
    // when we have a 256bit-wide blend with immediate.
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

    // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      // Without VLX, widen 128/256-bit masked loads and stores to 512 bits so
      // the AVX-512 k-register forms can be used.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
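
    // AVX512DQ adds native conversions between 64-bit integer elements and FP.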
    if (Subtarget.hasDQI()) {
      for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
        setOperationAction(ISD::SINT_TO_FP, VT, Legal);
        setOperationAction(ISD::UINT_TO_FP, VT, Legal);
        setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);
      }
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }

    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);

      // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
    setOperationAction(ISD::ABS, MVT::v4i64, Legal);
    setOperationAction(ISD::ABS, MVT::v2i64, Legal);
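
    // For vXi1 mask vectors, add/sub are just XOR and mul is AND; these and
    // the other mask-vector operations are custom-lowered onto mask registers.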
1305 for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1306 setOperationAction(ISD::ADD, VT, Custom);
1307 setOperationAction(ISD::SUB, VT, Custom);
1308 setOperationAction(ISD::MUL, VT, Custom);
1309 setOperationAction(ISD::SETCC, VT, Custom);
1310 setOperationAction(ISD::SELECT, VT, Custom);
1311 setOperationAction(ISD::TRUNCATE, VT, Custom);
1313 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1314 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1315 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1316 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1317 setOperationAction(ISD::VSELECT, VT, Expand);
1320 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1321 setOperationAction(ISD::SMAX, VT, Legal);
1322 setOperationAction(ISD::UMAX, VT, Legal);
1323 setOperationAction(ISD::SMIN, VT, Legal);
1324 setOperationAction(ISD::UMIN, VT, Legal);
1325 setOperationAction(ISD::ABS, VT, Legal);
1326 setOperationAction(ISD::SRL, VT, Custom);
1327 setOperationAction(ISD::SHL, VT, Custom);
1328 setOperationAction(ISD::SRA, VT, Custom);
1329 setOperationAction(ISD::CTPOP, VT, Custom);
1330 setOperationAction(ISD::CTTZ, VT, Custom);
1333 // Need to promote to 64-bit even though we have 32-bit masked instructions
1334 // because the IR optimizers rearrange bitcasts around logic ops leaving
1335 // too many variations to handle if we don't promote them.
1336 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1337 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1338 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
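// Illustrative sketch (editorial, hypothetical IR): the promotion above means
// a 512-bit v16i32 logic op is selected over the v8i64 type, e.g.
//   (v16i32 (bitcast (and (v8i64 (bitcast %a)), (v8i64 (bitcast %b)))))
// so a single q-form instruction covers the bitcast variations the IR
// optimizers produce.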
1340 if (Subtarget.hasCDI()) {
1341 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1342 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1343 MVT::v4i64, MVT::v8i64}) {
1344 setOperationAction(ISD::CTLZ, VT, Legal);
1345 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1347 } // Subtarget.hasCDI()
1349 if (Subtarget.hasDQI()) {
1350 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1351 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1352 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1353 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1356 if (Subtarget.hasVPOPCNTDQ()) {
1357 // VPOPCNTDQ sub-targets extend 128/256-bit vectors to use the AVX-512
1358 // version of vpopcntd/vpopcntq.
1359 for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1360 MVT::v4i32, MVT::v2i64})
1361 setOperationAction(ISD::CTPOP, VT, Legal);
1364 // Custom lower several nodes.
1365 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1366 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1367 setOperationAction(ISD::MGATHER, VT, Custom);
1368 setOperationAction(ISD::MSCATTER, VT, Custom);
1370 // Extract subvector is special because the value type
1371 // (result) is 256-bit but the source is 512-bit wide.
1372 // 128-bit was made Custom under AVX1.
1373 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1374 MVT::v8f32, MVT::v4f64 })
1375 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1376 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1377 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1378 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1380 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1381 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1382 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1383 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1384 setOperationAction(ISD::VSELECT, VT, Custom);
1385 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1386 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1387 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1388 setOperationAction(ISD::MLOAD, VT, Legal);
1389 setOperationAction(ISD::MSTORE, VT, Legal);
1390 setOperationAction(ISD::MGATHER, VT, Legal);
1391 setOperationAction(ISD::MSCATTER, VT, Custom);
1393 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1394 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1395 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1399 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1400 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1401 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1403 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1404 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1406 setOperationAction(ISD::ADD, MVT::v32i1, Custom);
1407 setOperationAction(ISD::ADD, MVT::v64i1, Custom);
1408 setOperationAction(ISD::SUB, MVT::v32i1, Custom);
1409 setOperationAction(ISD::SUB, MVT::v64i1, Custom);
1410 setOperationAction(ISD::MUL, MVT::v32i1, Custom);
1411 setOperationAction(ISD::MUL, MVT::v64i1, Custom);
1413 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1414 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1415 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1416 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1417 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1418 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1419 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1420 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1421 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1422 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1423 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1424 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1425 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1426 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1427 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1428 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1429 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1430 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1431 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1432 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1433 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1434 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1435 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1436 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1437 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1438 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1439 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1440 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1441 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1442 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1443 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1444 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1445 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1446 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1447 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1448 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1449 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1450 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1451 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1452 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1453 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1454 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1455 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1456 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1457 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1459 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1461 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1462 if (Subtarget.hasVLX()) {
1463 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1464 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1467 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1468 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1469 setOperationAction(ISD::MLOAD, VT, Action);
1470 setOperationAction(ISD::MSTORE, VT, Action);
1473 if (Subtarget.hasCDI()) {
1474 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1475 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1478 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1479 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1480 setOperationAction(ISD::VSELECT, VT, Custom);
1481 setOperationAction(ISD::ABS, VT, Legal);
1482 setOperationAction(ISD::SRL, VT, Custom);
1483 setOperationAction(ISD::SHL, VT, Custom);
1484 setOperationAction(ISD::SRA, VT, Custom);
1485 setOperationAction(ISD::MLOAD, VT, Legal);
1486 setOperationAction(ISD::MSTORE, VT, Legal);
1487 setOperationAction(ISD::CTPOP, VT, Custom);
1488 setOperationAction(ISD::CTTZ, VT, Custom);
1489 setOperationAction(ISD::SMAX, VT, Legal);
1490 setOperationAction(ISD::UMAX, VT, Legal);
1491 setOperationAction(ISD::SMIN, VT, Legal);
1492 setOperationAction(ISD::UMIN, VT, Legal);
1494 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1495 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1496 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1499 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1500 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1501 if (Subtarget.hasVLX()) {
1502 // FIXME: These instructions are also available on SSE/AVX2; add the relevant patterns.
1503 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1504 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1509 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1510 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1511 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1513 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1514 setOperationAction(ISD::ADD, VT, Custom);
1515 setOperationAction(ISD::SUB, VT, Custom);
1516 setOperationAction(ISD::MUL, VT, Custom);
1517 setOperationAction(ISD::VSELECT, VT, Expand);
1519 setOperationAction(ISD::TRUNCATE, VT, Custom);
1520 setOperationAction(ISD::SETCC, VT, Custom);
1521 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1522 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1523 setOperationAction(ISD::SELECT, VT, Custom);
1524 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1525 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1528 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1529 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1530 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1531 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1533 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1534 setOperationAction(ISD::SMAX, VT, Legal);
1535 setOperationAction(ISD::UMAX, VT, Legal);
1536 setOperationAction(ISD::SMIN, VT, Legal);
1537 setOperationAction(ISD::UMIN, VT, Legal);
1541 // We want to custom lower some of our intrinsics.
1542 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1543 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1544 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1545 if (!Subtarget.is64Bit()) {
1546 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1547 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1551 // handle type legalization for these operations here.
1553 // FIXME: We really should do custom legalization for addition and
1554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1555 // than generic legalization for 64-bit multiplication-with-overflow, though.
1556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1557 if (VT == MVT::i64 && !Subtarget.is64Bit())
1559 // Add/Sub/Mul with overflow operations are custom lowered.
1560 setOperationAction(ISD::SADDO, VT, Custom);
1561 setOperationAction(ISD::UADDO, VT, Custom);
1562 setOperationAction(ISD::SSUBO, VT, Custom);
1563 setOperationAction(ISD::USUBO, VT, Custom);
1564 setOperationAction(ISD::SMULO, VT, Custom);
1565 setOperationAction(ISD::UMULO, VT, Custom);
1567 // Support carry-in as a value rather than glue.
1568 setOperationAction(ISD::ADDCARRY, VT, Custom);
1569 setOperationAction(ISD::SUBCARRY, VT, Custom);
1570 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1573 if (!Subtarget.is64Bit()) {
1574 // These libcalls are not available on 32-bit targets.
1575 setLibcallName(RTLIB::SHL_I128, nullptr);
1576 setLibcallName(RTLIB::SRL_I128, nullptr);
1577 setLibcallName(RTLIB::SRA_I128, nullptr);
1580 // Combine sin / cos into one node or libcall if possible.
1581 if (Subtarget.hasSinCos()) {
1582 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1583 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1584 if (Subtarget.isTargetDarwin()) {
1585 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1586 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1587 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1588 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1592 if (Subtarget.isTargetWin64()) {
1593 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1594 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1595 setOperationAction(ISD::SREM, MVT::i128, Custom);
1596 setOperationAction(ISD::UREM, MVT::i128, Custom);
1597 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1598 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1601 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1602 // is. We should promote the value to 64 bits to solve this.
1603 // This is what the CRT headers do - `fmodf` is an inline header
1604 // function that casts to f64 and calls `fmod`.
1605 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1606 Subtarget.isTargetWindowsItanium()))
1607 for (ISD::NodeType Op :
1608 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1609 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1610 if (isOperationExpand(Op, MVT::f32))
1611 setOperationAction(Op, MVT::f32, Promote);
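// For reference, a sketch of the CRT-style inline this promotion mirrors
// (hypothetical reconstruction of the header behavior):
//   static inline float fmodf(float X, float Y) {
//     return (float)fmod((double)X, (double)Y);
//   }
// Promote on f32 performs the same widen-compute-narrow sequence during ISel.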
1613 // We have target-specific dag combine patterns for the following nodes:
1614 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1615 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1616 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1617 setTargetDAGCombine(ISD::BITCAST);
1618 setTargetDAGCombine(ISD::VSELECT);
1619 setTargetDAGCombine(ISD::SELECT);
1620 setTargetDAGCombine(ISD::SHL);
1621 setTargetDAGCombine(ISD::SRA);
1622 setTargetDAGCombine(ISD::SRL);
1623 setTargetDAGCombine(ISD::OR);
1624 setTargetDAGCombine(ISD::AND);
1625 setTargetDAGCombine(ISD::ADD);
1626 setTargetDAGCombine(ISD::FADD);
1627 setTargetDAGCombine(ISD::FSUB);
1628 setTargetDAGCombine(ISD::FNEG);
1629 setTargetDAGCombine(ISD::FMA);
1630 setTargetDAGCombine(ISD::FMINNUM);
1631 setTargetDAGCombine(ISD::FMAXNUM);
1632 setTargetDAGCombine(ISD::SUB);
1633 setTargetDAGCombine(ISD::LOAD);
1634 setTargetDAGCombine(ISD::MLOAD);
1635 setTargetDAGCombine(ISD::STORE);
1636 setTargetDAGCombine(ISD::MSTORE);
1637 setTargetDAGCombine(ISD::TRUNCATE);
1638 setTargetDAGCombine(ISD::ZERO_EXTEND);
1639 setTargetDAGCombine(ISD::ANY_EXTEND);
1640 setTargetDAGCombine(ISD::SIGN_EXTEND);
1641 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1642 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1643 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1644 setTargetDAGCombine(ISD::SINT_TO_FP);
1645 setTargetDAGCombine(ISD::UINT_TO_FP);
1646 setTargetDAGCombine(ISD::SETCC);
1647 setTargetDAGCombine(ISD::MUL);
1648 setTargetDAGCombine(ISD::XOR);
1649 setTargetDAGCombine(ISD::MSCATTER);
1650 setTargetDAGCombine(ISD::MGATHER);
1652 computeRegisterProperties(Subtarget.getRegisterInfo());
1654 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1655 MaxStoresPerMemsetOptSize = 8;
1656 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1657 MaxStoresPerMemcpyOptSize = 4;
1658 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1659 MaxStoresPerMemmoveOptSize = 4;
1660 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1661 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
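// E.g. (sketch): the parameter is a log2 value, so the byte alignment is
//   unsigned AlignBytes = 1u << ExperimentalPrefLoopAlignment; // 4 -> 16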
1663 // An out-of-order CPU can speculatively execute past a predictable branch,
1664 // but a conditional move could be stalled by an expensive earlier operation.
1665 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1666 EnableExtLdPromotion = true;
1667 setPrefFunctionAlignment(4); // 2^4 bytes.
1669 verifyIntrinsicTables();
1672 // This has so far only been implemented for 64-bit MachO.
1673 bool X86TargetLowering::useLoadStackGuardNode() const {
1674 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1677 TargetLoweringBase::LegalizeTypeAction
1678 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1679 if (ExperimentalVectorWideningLegalization &&
1680 VT.getVectorNumElements() != 1 &&
1681 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1682 return TypeWidenVector;
1684 return TargetLoweringBase::getPreferredVectorAction(VT);
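// Usage sketch (hypothetical helper, not used by the lowering itself): query
// whether the hook above chooses widening for a given vector type; with the
// experimental flag set, an illegal v2i32 is widened rather than promoted.
static bool prefersWideningSketch(const X86TargetLowering &TLI, EVT VT) {
  return TLI.getPreferredVectorAction(VT) ==
         TargetLoweringBase::TypeWidenVector;
}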
1687 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1688 LLVMContext& Context,
1693 if (VT.isSimple()) {
1694 MVT VVT = VT.getSimpleVT();
1695 const unsigned NumElts = VVT.getVectorNumElements();
1696 MVT EltVT = VVT.getVectorElementType();
1697 if (VVT.is512BitVector()) {
1698 if (Subtarget.hasAVX512())
1699 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1700 EltVT == MVT::f32 || EltVT == MVT::f64)
1702 case 8: return MVT::v8i1;
1703 case 16: return MVT::v16i1;
1705 if (Subtarget.hasBWI())
1706 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1708 case 32: return MVT::v32i1;
1709 case 64: return MVT::v64i1;
1713 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1714 return MVT::getVectorVT(MVT::i1, NumElts);
1716 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1717 EVT LegalVT = getTypeToTransformTo(Context, VT);
1718 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1721 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1723 case 2: return MVT::v2i1;
1724 case 4: return MVT::v4i1;
1725 case 8: return MVT::v8i1;
1729 return VT.changeVectorElementTypeToInteger();
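// Example sketch (hypothetical helper; assumes an AVX512F subtarget): per the
// logic above, a 512-bit integer compare produces a vXi1 mask type.
static void setCCResultTypeExampleSketch(const X86TargetLowering &TLI,
                                         const DataLayout &DL,
                                         LLVMContext &Ctx) {
  assert(TLI.getSetCCResultType(DL, Ctx, MVT::v16i32) == MVT::v16i1 &&
         "v16i32 compares should yield a v16i1 mask on AVX512F");
  (void)TLI; (void)DL; (void)Ctx; // Silence unused warnings in NDEBUG builds.
}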
1732 /// Helper for getByValTypeAlignment to determine
1733 /// the desired ByVal argument alignment.
1734 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1737 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1738 if (VTy->getBitWidth() == 128)
1740 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1741 unsigned EltAlign = 0;
1742 getMaxByValAlign(ATy->getElementType(), EltAlign);
1743 if (EltAlign > MaxAlign)
1744 MaxAlign = EltAlign;
1745 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1746 for (auto *EltTy : STy->elements()) {
1747 unsigned EltAlign = 0;
1748 getMaxByValAlign(EltTy, EltAlign);
1749 if (EltAlign > MaxAlign)
1750 MaxAlign = EltAlign;
1757 /// Return the desired alignment for ByVal aggregate
1758 /// function arguments in the caller parameter area. For X86, aggregates
1759 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1760 /// are at 4-byte boundaries.
1761 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1762 const DataLayout &DL) const {
1763 if (Subtarget.is64Bit()) {
1764 // Max of 8 and alignment of type.
1765 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1772 if (Subtarget.hasSSE1())
1773 getMaxByValAlign(Ty, Align);
1777 /// Returns the target specific optimal type for load
1778 /// and store operations as a result of memset, memcpy, and memmove
1779 /// lowering. If DstAlign is zero, the destination alignment can satisfy
1780 /// any constraint. Similarly, if SrcAlign is zero there is no need to
1781 /// check it against an alignment requirement,
1782 /// probably because the source does not need to be loaded. If 'IsMemset' is
1783 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1784 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1785 /// source is constant so it does not need to be loaded.
1786 /// It returns EVT::Other if the type should be determined using generic
1787 /// target-independent logic.
1789 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1790 unsigned DstAlign, unsigned SrcAlign,
1791 bool IsMemset, bool ZeroMemset,
1793 MachineFunction &MF) const {
1794 const Function *F = MF.getFunction();
1795 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1797 (!Subtarget.isUnalignedMem16Slow() ||
1798 ((DstAlign == 0 || DstAlign >= 16) &&
1799 (SrcAlign == 0 || SrcAlign >= 16)))) {
1800 // FIXME: Check if unaligned 32-byte accesses are slow.
1801 if (Size >= 32 && Subtarget.hasAVX()) {
1802 // Although this isn't a well-supported type for AVX1, we'll let
1803 // legalization and shuffle lowering produce the optimal codegen. If we
1804 // choose an optimal type with a vector element larger than a byte,
1805 // getMemsetStores() may create an intermediate splat (using an integer
1806 // multiply) before we splat as a vector.
1809 if (Subtarget.hasSSE2())
1811 // TODO: Can SSE1 handle a byte vector?
1812 if (Subtarget.hasSSE1())
1814 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1815 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1816 // Do not use f64 to lower memcpy if source is string constant. It's
1817 // better to use i32 to avoid the loads.
1818 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1819 // The gymnastics of splatting a byte value into an XMM register and then
1820 // only using 8-byte stores (because this is a CPU with slow unaligned
1821 // 16-byte accesses) makes that a loser.
1825 // This is a compromise. If we reach here, unaligned accesses may be slow on
1826 // this target. However, creating smaller, aligned accesses could be even
1827 // slower and would certainly be a lot more code.
1828 if (Subtarget.is64Bit() && Size >= 8)
1833 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1835 return X86ScalarSSEf32;
1836 else if (VT == MVT::f64)
1837 return X86ScalarSSEf64;
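// Usage sketch (hypothetical helper): how a 32-byte memcpy with unknown
// alignment would query getOptimalMemOpType above; on AVX subtargets with
// fast unaligned accesses this yields MVT::v32i8.
static EVT optimalMemcpyTypeSketch(const X86TargetLowering &TLI,
                                   MachineFunction &MF) {
  return TLI.getOptimalMemOpType(/*Size=*/32, /*DstAlign=*/0, /*SrcAlign=*/0,
                                 /*IsMemset=*/false, /*ZeroMemset=*/false,
                                 /*MemcpyStrSrc=*/false, MF);
}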
1842 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1847 switch (VT.getSizeInBits()) {
1849 // 8-byte and under are always assumed to be fast.
1853 *Fast = !Subtarget.isUnalignedMem16Slow();
1856 *Fast = !Subtarget.isUnalignedMem32Slow();
1858 // TODO: What about AVX-512 (512-bit) accesses?
1861 // Misaligned accesses of any size are always allowed.
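// Usage sketch (hypothetical helper, assuming the signature used in this
// revision): query whether an unaligned 256-bit access is fast here.
static bool fastUnaligned256Sketch(const X86TargetLowering &TLI) {
  bool Fast = false;
  (void)TLI.allowsMisalignedMemoryAccesses(MVT::v32i8, /*AddrSpace=*/0,
                                           /*Align=*/1, &Fast);
  return Fast;
}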
1865 /// Return the entry encoding for a jump table in the
1866 /// current function. The returned value is a member of the
1867 /// MachineJumpTableInfo::JTEntryKind enum.
1868 unsigned X86TargetLowering::getJumpTableEncoding() const {
1869 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1870 // symbol.
1871 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1872 return MachineJumpTableInfo::EK_Custom32;
1874 // Otherwise, use the normal jump table encoding heuristics.
1875 return TargetLowering::getJumpTableEncoding();
1878 bool X86TargetLowering::useSoftFloat() const {
1879 return Subtarget.useSoftFloat();
1882 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1883 ArgListTy &Args) const {
1885 // Only relabel X86-32 for C / Stdcall CCs.
1886 if (Subtarget.is64Bit())
1888 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1890 unsigned ParamRegs = 0;
1891 if (auto *M = MF->getFunction()->getParent())
1892 ParamRegs = M->getNumberRegisterParameters();
1894 // Mark the first N integer arguments as being passed in registers.
1895 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1896 Type *T = Args[Idx].Ty;
1897 if (T->isPointerTy() || T->isIntegerTy())
1898 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1899 unsigned numRegs = 1;
1900 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1902 if (ParamRegs < numRegs)
1904 ParamRegs -= numRegs;
1905 Args[Idx].IsInReg = true;
1911 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1912 const MachineBasicBlock *MBB,
2913 unsigned uid, MCContext &Ctx) const {
1914 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1915 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916 // entries.
1917 return MCSymbolRefExpr::create(MBB->getSymbol(),
1918 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1921 /// Returns relocation base for the given PIC jumptable.
1922 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923 SelectionDAG &DAG) const {
1924 if (!Subtarget.is64Bit())
1925 // This doesn't have SDLoc associated with it, but is not really the
1926 // same as a Register.
1927 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1928 getPointerTy(DAG.getDataLayout()));
1932 /// This returns the relocation base for the given PIC jumptable,
1933 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1934 const MCExpr *X86TargetLowering::
1935 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1936 MCContext &Ctx) const {
1937 // X86-64 uses RIP relative addressing based on the jump table label.
1938 if (Subtarget.isPICStyleRIPRel())
1939 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1941 // Otherwise, the reference is relative to the PIC base.
1942 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1945 std::pair<const TargetRegisterClass *, uint8_t>
1946 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1948 const TargetRegisterClass *RRC = nullptr;
1950 switch (VT.SimpleTy) {
1952 return TargetLowering::findRepresentativeClass(TRI, VT);
1953 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1954 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1957 RRC = &X86::VR64RegClass;
1959 case MVT::f32: case MVT::f64:
1960 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1961 case MVT::v4f32: case MVT::v2f64:
1962 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1963 case MVT::v8f32: case MVT::v4f64:
1964 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1965 case MVT::v16f32: case MVT::v8f64:
1966 RRC = &X86::VR128XRegClass;
1969 return std::make_pair(RRC, Cost);
1972 unsigned X86TargetLowering::getAddressSpace() const {
1973 if (Subtarget.is64Bit())
1974 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1978 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
1979 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
1980 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
1983 static Constant* SegmentOffset(IRBuilder<> &IRB,
1984 unsigned Offset, unsigned AddressSpace) {
1985 return ConstantExpr::getIntToPtr(
1986 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
1987 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
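// Example sketch (hypothetical helper): SegmentOffset(IRB, 0x28, 257) yields
//   inttoptr (i32 40 to i8* addrspace(257)*)
// i.e. the %fs:0x28 stack-guard slot on x86-64 (AS 257 models %fs, 256 %gs).
static Value *linuxStackGuardSlotSketch(IRBuilder<> &IRB) {
  return SegmentOffset(IRB, /*Offset=*/0x28, /*AddressSpace=*/257);
}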
1990 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
1991 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
1992 // tcbhead_t; use it instead of the usual global variable (see
1993 // sysdeps/{i386,x86_64}/nptl/tls.h)
1994 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
1995 if (Subtarget.isTargetFuchsia()) {
1996 // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
1997 return SegmentOffset(IRB, 0x10, getAddressSpace());
1999 // %fs:0x28, unless we're using a Kernel code model, in which case
2000 // it's %gs:0x28. On i386 it's %gs:0x14.
2001 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2002 return SegmentOffset(IRB, Offset, getAddressSpace());
2006 return TargetLowering::getIRStackGuard(IRB);
2009 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2010 // The MSVC CRT provides functionality for stack protection.
2011 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2012 // MSVC CRT has a global variable holding security cookie.
2013 M.getOrInsertGlobal("__security_cookie",
2014 Type::getInt8PtrTy(M.getContext()));
2016 // MSVC CRT has a function to validate security cookie.
2017 auto *SecurityCheckCookie = cast<Function>(
2018 M.getOrInsertFunction("__security_check_cookie",
2019 Type::getVoidTy(M.getContext()),
2020 Type::getInt8PtrTy(M.getContext())));
2021 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2022 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2025 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2026 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2028 TargetLowering::insertSSPDeclarations(M);
2031 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2032 // MSVC CRT has a global variable holding security cookie.
2033 if (Subtarget.getTargetTriple().isOSMSVCRT())
2034 return M.getGlobalVariable("__security_cookie");
2035 return TargetLowering::getSDagStackGuard(M);
2038 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2039 // MSVC CRT has a function to validate security cookie.
2040 if (Subtarget.getTargetTriple().isOSMSVCRT())
2041 return M.getFunction("__security_check_cookie");
2042 return TargetLowering::getSSPStackGuardCheck(M);
2045 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2046 if (Subtarget.getTargetTriple().isOSContiki())
2047 return getDefaultSafeStackPointerLocation(IRB, false);
2049 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2050 // definition of TLS_SLOT_SAFESTACK in
2051 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2052 if (Subtarget.isTargetAndroid()) {
2053 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2055 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2056 return SegmentOffset(IRB, Offset, getAddressSpace());
2059 // Fuchsia is similar.
2060 if (Subtarget.isTargetFuchsia()) {
2061 // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
2062 return SegmentOffset(IRB, 0x18, getAddressSpace());
2065 return TargetLowering::getSafeStackPointerLocation(IRB);
2068 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2069 unsigned DestAS) const {
2070 assert(SrcAS != DestAS && "Expected different address spaces!");
2072 return SrcAS < 256 && DestAS < 256;
2075 //===----------------------------------------------------------------------===//
2076 // Return Value Calling Convention Implementation
2077 //===----------------------------------------------------------------------===//
2079 #include "X86GenCallingConv.inc"
2081 bool X86TargetLowering::CanLowerReturn(
2082 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2083 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2084 SmallVector<CCValAssign, 16> RVLocs;
2085 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2086 return CCInfo.CheckReturn(Outs, RetCC_X86);
2089 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2090 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2094 /// Lowers mask values (v*i1) to the local register values.
2095 /// \returns DAG node after lowering to register type
2096 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2097 const SDLoc &Dl, SelectionDAG &DAG) {
2098 EVT ValVT = ValArg.getValueType();
2100 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2101 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2102 // Two-stage lowering might be required:
2103 // bitcast: v8i1 -> i8 / v16i1 -> i16
2104 // anyextend: i8 -> i32 / i16 -> i32
2105 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2106 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2107 if (ValLoc == MVT::i32)
2108 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2110 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2111 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2112 // One-stage lowering is required:
2113 // bitcast: v32i1 -> i32 / v64i1 -> i64
2114 return DAG.getBitcast(ValLoc, ValArg);
2116 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
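// Usage sketch (hypothetical helper): a v16i1 mask destined for a 32-bit
// location takes the two-stage path above (bitcast to i16, any-extend to i32).
static SDValue lowerV16i1MaskToI32Sketch(SDValue Mask, const SDLoc &Dl,
                                         SelectionDAG &DAG) {
  assert(Mask.getValueType() == MVT::v16i1 && "Sketch expects a v16i1 mask");
  return lowerMasksToReg(Mask, MVT::i32, Dl, DAG);
}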
2119 /// Breaks a v64i1 value into two registers and adds the new node to the DAG.
2120 static void Passv64i1ArgInRegs(
2121 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2122 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2123 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2124 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2125 "Expected AVX512BW or BMI target!");
2126 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2127 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2128 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2129 "The value should reside in two registers");
2131 // Before splitting the value we cast it to i64
2132 Arg = DAG.getBitcast(MVT::i64, Arg);
2134 // Split the value into two i32 halves.
2136 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2137 DAG.getConstant(0, Dl, MVT::i32));
2138 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2139 DAG.getConstant(1, Dl, MVT::i32));
2141 // Attach the two i32 halves to the corresponding registers.
2142 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2143 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2147 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2149 const SmallVectorImpl<ISD::OutputArg> &Outs,
2150 const SmallVectorImpl<SDValue> &OutVals,
2151 const SDLoc &dl, SelectionDAG &DAG) const {
2152 MachineFunction &MF = DAG.getMachineFunction();
2153 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2155 // In some cases we need to disable registers from the default CSR list.
2156 // For example, when they are used for argument passing.
2157 bool ShouldDisableCalleeSavedRegister =
2158 CallConv == CallingConv::X86_RegCall ||
2159 MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2161 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2162 report_fatal_error("X86 interrupts may not return any value");
2164 SmallVector<CCValAssign, 16> RVLocs;
2165 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2166 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2169 SmallVector<SDValue, 6> RetOps;
2170 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2171 // Operand #1 = Bytes To Pop
2172 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2175 // Copy the result values into the output registers.
2176 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2178 CCValAssign &VA = RVLocs[I];
2179 assert(VA.isRegLoc() && "Can only return in registers!");
2181 // Add the register to the CalleeSaveDisableRegs list.
2182 if (ShouldDisableCalleeSavedRegister)
2183 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2185 SDValue ValToCopy = OutVals[OutsIndex];
2186 EVT ValVT = ValToCopy.getValueType();
2188 // Promote values to the appropriate types.
2189 if (VA.getLocInfo() == CCValAssign::SExt)
2190 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2191 else if (VA.getLocInfo() == CCValAssign::ZExt)
2192 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2193 else if (VA.getLocInfo() == CCValAssign::AExt) {
2194 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2195 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2197 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2199 else if (VA.getLocInfo() == CCValAssign::BCvt)
2200 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2202 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2203 "Unexpected FP-extend for return value.");
2205 // If this is x86-64, and we disabled SSE, we can't return FP values,
2206 // or SSE or MMX vectors.
2207 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2208 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2209 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2210 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2211 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2212 } else if (ValVT == MVT::f64 &&
2213 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2214 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2215 // llvm-gcc has never done it right and no one has noticed, so this
2216 // should be OK for now.
2217 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2218 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2221 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2222 // the RET instruction and handled by the FP Stackifier.
2223 if (VA.getLocReg() == X86::FP0 ||
2224 VA.getLocReg() == X86::FP1) {
2225 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2226 // change the value to the FP stack register class.
2227 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2228 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2229 RetOps.push_back(ValToCopy);
2230 // Don't emit a copytoreg.
2234 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2235 // which is returned in RAX / RDX.
2236 if (Subtarget.is64Bit()) {
2237 if (ValVT == MVT::x86mmx) {
2238 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2239 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2240 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2242 // If we don't have SSE2 available, convert to v4f32 so the generated
2243 // register is legal.
2244 if (!Subtarget.hasSSE2())
2245 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2250 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2252 if (VA.needsCustom()) {
2253 assert(VA.getValVT() == MVT::v64i1 &&
2254 "Currently the only custom case is when we split v64i1 to 2 regs");
2256 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2259 assert(2 == RegsToPass.size() &&
2260 "Expecting two registers after Pass64BitArgInRegs");
2262 // Add the second register to the CalleeSaveDisableRegs list.
2263 if (ShouldDisableCalleeSavedRegister)
2264 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2266 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2269 // Add nodes to the DAG and add the values into the RetOps list
2270 for (auto &Reg : RegsToPass) {
2271 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2272 Flag = Chain.getValue(1);
2273 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2277 // The Swift calling convention does not require us to copy the sret argument
2278 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2280 // All x86 ABIs require that for returning structs by value we copy
2281 // the sret argument into %rax/%eax (depending on ABI) for the return.
2282 // We saved the argument into a virtual register in the entry block,
2283 // so now we copy the value out and into %rax/%eax.
2285 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2286 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2287 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2288 // either case FuncInfo->setSRetReturnReg() will have been called.
2289 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2290 // When we have both sret and another return value, we should use the
2291 // original Chain stored in RetOps[0], instead of the current Chain updated
2292 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2294 // For the case of sret and another return value, we have
2295 // Chain_0 at the function entry
2296 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2297 // If we use Chain_1 in getCopyFromReg, we will have
2298 // Val = getCopyFromReg(Chain_1)
2299 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2301 // getCopyToReg(Chain_0) will be glued together with
2302 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2303 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2304 // Data dependency from Unit B to Unit A due to usage of Val in
2305 // getCopyToReg(Chain_1, Val)
2306 // Chain dependency from Unit A to Unit B
2308 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2309 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2310 getPointerTy(MF.getDataLayout()));
2313 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2314 X86::RAX : X86::EAX;
2315 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2316 Flag = Chain.getValue(1);
2318 // RAX/EAX now acts like a return value.
2320 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2322 // Add the returned register to the CalleeSaveDisableRegs list.
2323 if (ShouldDisableCalleeSavedRegister)
2324 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2327 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2328 const MCPhysReg *I =
2329 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2332 if (X86::GR64RegClass.contains(*I))
2333 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2335 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2339 RetOps[0] = Chain; // Update chain.
2341 // Add the flag if we have it.
2343 RetOps.push_back(Flag);
2345 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2346 if (CallConv == CallingConv::X86_INTR)
2347 opcode = X86ISD::IRET;
2348 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2351 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2352 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2355 SDValue TCChain = Chain;
2356 SDNode *Copy = *N->use_begin();
2357 if (Copy->getOpcode() == ISD::CopyToReg) {
2358 // If the copy has a glue operand, we conservatively assume it isn't safe to
2359 // perform a tail call.
2360 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2362 TCChain = Copy->getOperand(0);
2363 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2366 bool HasRet = false;
2367 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2369 if (UI->getOpcode() != X86ISD::RET_FLAG)
2371 // If we are returning more than one value, we can definitely
2372 // not make a tail call; see PR19530.
2373 if (UI->getNumOperands() > 4)
2375 if (UI->getNumOperands() == 4 &&
2376 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2388 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2389 ISD::NodeType ExtendKind) const {
2390 MVT ReturnMVT = MVT::i32;
2392 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2393 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2394 // The ABI does not require i1, i8 or i16 to be extended.
2396 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2397 // always extending i8/i16 return values, so keep doing that for now.
2399 ReturnMVT = MVT::i8;
2402 EVT MinVT = getRegisterType(Context, ReturnMVT);
2403 return VT.bitsLT(MinVT) ? MinVT : VT;
2406 /// Reads two 32-bit registers and creates a 64-bit mask value.
2407 /// \param VA The current 32-bit value that needs to be assigned.
2408 /// \param NextVA The next 32-bit value that needs to be assigned.
2409 /// \param Root The parent DAG node.
2410 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2411 /// glue purposes. In case the DAG is already using a
2412 /// physical register instead of a virtual one, we should glue
2413 /// our new SDValue to the InFlag SDValue.
2414 /// \return a new 64-bit SDValue.
2415 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2416 SDValue &Root, SelectionDAG &DAG,
2417 const SDLoc &Dl, const X86Subtarget &Subtarget,
2418 SDValue *InFlag = nullptr) {
2419 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2420 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2421 assert(VA.getValVT() == MVT::v64i1 &&
2422 "Expecting first location of 64 bit width type");
2423 assert(NextVA.getValVT() == VA.getValVT() &&
2424 "The locations should have the same type");
2425 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2426 "The values should reside in two registers");
2430 SDValue ArgValueLo, ArgValueHi;
2432 MachineFunction &MF = DAG.getMachineFunction();
2433 const TargetRegisterClass *RC = &X86::GR32RegClass;
2435 // Read a 32 bit value from the registers
2436 if (nullptr == InFlag) {
2437 // When no physical register is present,
2438 // create an intermediate virtual register
2439 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2440 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2441 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2442 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2444 // When a physical register is available read the value from it and glue
2445 // the reads together.
2447 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2448 *InFlag = ArgValueLo.getValue(2);
2450 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2451 *InFlag = ArgValueHi.getValue(2);
2454 // Convert the lower i32 into a v32i1 mask
2455 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2457 // Convert the upper i32 into a v32i1 mask
2458 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2460 // Concatenate the two values together
2461 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2464 /// Lowers a register value of size 8/16/32/64 bits
2465 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2466 /// \returns a DAG node containing the operand after lowering to mask type.
2467 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2468 const EVT &ValLoc, const SDLoc &Dl,
2469 SelectionDAG &DAG) {
2470 SDValue ValReturned = ValArg;
2472 if (ValVT == MVT::v1i1)
2473 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2475 if (ValVT == MVT::v64i1) {
2476 // On 32-bit targets this case is handled by getv64i1Argument.
2477 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2478 // On 64-bit targets there is no need to truncate the value; only bitcast it.
2481 switch (ValVT.getSimpleVT().SimpleTy) {
2492 llvm_unreachable("Expecting a vector of i1 types");
2495 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2497 return DAG.getBitcast(ValVT, ValReturned);
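// Usage sketch (hypothetical helper): the inverse direction; an i32 location
// carrying a v8i1 mask is truncated to i8 and bitcast back to the mask type.
static SDValue recoverV8i1MaskSketch(SDValue Loc, const SDLoc &Dl,
                                     SelectionDAG &DAG) {
  assert(Loc.getValueType() == MVT::i32 && "Sketch expects an i32 location");
  return lowerRegToMasks(Loc, MVT::v8i1, MVT::i32, Dl, DAG);
}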
2500 /// Lower the result values of a call into the
2501 /// appropriate copies out of appropriate physical registers.
2503 SDValue X86TargetLowering::LowerCallResult(
2504 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2505 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2506 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2507 uint32_t *RegMask) const {
2509 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2510 // Assign locations to each value returned by this call.
2511 SmallVector<CCValAssign, 16> RVLocs;
2512 bool Is64Bit = Subtarget.is64Bit();
2513 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2515 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2517 // Copy all of the result registers out of their specified physreg.
2518 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2520 CCValAssign &VA = RVLocs[I];
2521 EVT CopyVT = VA.getLocVT();
2523 // In some calling conventions we need to remove the used registers
2524 // from the register mask.
2526 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2527 SubRegs.isValid(); ++SubRegs)
2528 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2531 // If this is x86-64, and we disabled SSE, we can't return FP values
2532 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2533 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2534 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2535 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2538 // If we prefer to use the value in xmm registers, copy it out as f80 and
2539 // use a truncate to move it from fp stack reg to xmm reg.
2540 bool RoundAfterCopy = false;
2541 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2542 isScalarFPTypeInSSEReg(VA.getValVT())) {
2543 if (!Subtarget.hasX87())
2544 report_fatal_error("X87 register return with X87 disabled");
2546 RoundAfterCopy = (CopyVT != VA.getLocVT());
2550 if (VA.needsCustom()) {
2551 assert(VA.getValVT() == MVT::v64i1 &&
2552 "Currently the only custom case is when we split v64i1 to 2 regs");
2554 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2556 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2558 Val = Chain.getValue(0);
2559 InFlag = Chain.getValue(2);
2563 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2564 // This truncation won't change the value.
2565 DAG.getIntPtrConstant(1, dl));
2567 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2568 if (VA.getValVT().isVector() &&
2569 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2570 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2571 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2572 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2574 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2577 InVals.push_back(Val);
2583 //===----------------------------------------------------------------------===//
2584 // C & StdCall & Fast Calling Convention implementation
2585 //===----------------------------------------------------------------------===//
2586 // The StdCall calling convention is the standard for many Windows API
2587 // routines. It differs from the C calling convention only slightly: the
2588 // callee cleans up the stack rather than the caller, and symbols are
2589 // decorated (e.g. _foo@8). It doesn't support any vector arguments.
2590 // For info on fast calling convention see Fast Calling Convention (tail call)
2591 // implementation LowerX86_32FastCCCallTo.
2593 /// CallIsStructReturn - Determines whether a call uses struct return
2594 /// semantics.
2595 enum StructReturnType {
2596 NotStructReturn,
2597 RegStructReturn,
2598 StackStructReturn
2599 };
2600 static StructReturnType
2601 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2603 return NotStructReturn;
2605 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2606 if (!Flags.isSRet())
2607 return NotStructReturn;
2608 if (Flags.isInReg() || IsMCU)
2609 return RegStructReturn;
2610 return StackStructReturn;
2613 /// Determines whether a function uses struct return semantics.
2614 static StructReturnType
2615 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2617 return NotStructReturn;
2619 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2620 if (!Flags.isSRet())
2621 return NotStructReturn;
2622 if (Flags.isInReg() || IsMCU)
2623 return RegStructReturn;
2624 return StackStructReturn;
2627 /// Make a copy of an aggregate at the address specified by "Src" to the
2628 /// address "Dst", with size and alignment information specified by the
2629 /// byval parameter attribute. The copy will be passed as a byval function parameter.
2630 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2631 SDValue Chain, ISD::ArgFlagsTy Flags,
2632 SelectionDAG &DAG, const SDLoc &dl) {
2633 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2635 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2636 /*isVolatile*/false, /*AlwaysInline=*/true,
2637 /*isTailCall*/false,
2638 MachinePointerInfo(), MachinePointerInfo());
2641 /// Return true if the calling convention is one that we can guarantee TCO for.
2642 static bool canGuaranteeTCO(CallingConv::ID CC) {
2643 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2644 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2645 CC == CallingConv::HHVM);
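// Example sketch (hypothetical helper): fastcc qualifies for guaranteed TCO,
// while the plain C convention does not.
static void guaranteedTCOExampleSketch() {
  assert(canGuaranteeTCO(CallingConv::Fast) && "fastcc can guarantee TCO");
  assert(!canGuaranteeTCO(CallingConv::C) && "C CC cannot guarantee TCO");
}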
2648 /// Return true if we might ever do TCO for calls with this calling convention.
2649 static bool mayTailCallThisCC(CallingConv::ID CC) {
2651 // C calling conventions:
2652 case CallingConv::C:
2653 case CallingConv::X86_64_Win64:
2654 case CallingConv::X86_64_SysV:
2655 // Callee pop conventions:
2656 case CallingConv::X86_ThisCall:
2657 case CallingConv::X86_StdCall:
2658 case CallingConv::X86_VectorCall:
2659 case CallingConv::X86_FastCall:
2662 return canGuaranteeTCO(CC);
2666 /// Return true if the function is being made into a tailcall target by
2667 /// changing its ABI.
2668 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2669 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2672 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2674 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2675 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2678 ImmutableCallSite CS(CI);
2679 CallingConv::ID CalleeCC = CS.getCallingConv();
2680 if (!mayTailCallThisCC(CalleeCC))
2687 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2688 const SmallVectorImpl<ISD::InputArg> &Ins,
2689 const SDLoc &dl, SelectionDAG &DAG,
2690 const CCValAssign &VA,
2691 MachineFrameInfo &MFI, unsigned i) const {
2692 // Create the nodes corresponding to a load from this parameter slot.
2693 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2694 bool AlwaysUseMutable = shouldGuaranteeTCO(
2695 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2696 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2698 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2700 // If value is passed by pointer we have address passed instead of the value
2701 // itself. No need to extend if the mask value and location share the same
2702 // bit width.
2703 bool ExtendedInMem =
2704 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2705 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2707 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2708 ValVT = VA.getLocVT();
2710 ValVT = VA.getValVT();
2712 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2713 // taken by a return address.
2715 if (CallConv == CallingConv::X86_INTR) {
2716 // X86 interrupts may take one or two arguments.
2717 // Unlike a regular call, there is no return address on the stack.
2718 // The offset of the last argument therefore needs to be set to -4/-8 bytes,
2719 // while the offset of the first of two arguments is set to 0 bytes.
2720 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2721 if (Subtarget.is64Bit() && Ins.size() == 2) {
2722 // The stack pointer needs to be realigned for 64 bit handlers with error
2723 // code, so the argument offset changes by 8 bytes.
2728 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2729 // changed with more analysis.
2730 // In case of tail call optimization, mark all arguments mutable, since they
2731 // could be overwritten by the lowering of the arguments of a tail call.
2732 if (Flags.isByVal()) {
2733 unsigned Bytes = Flags.getByValSize();
2734 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2735 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2736 // Adjust SP offset of interrupt parameter.
2737 if (CallConv == CallingConv::X86_INTR) {
2738 MFI.setObjectOffset(FI, Offset);
2740 return DAG.getFrameIndex(FI, PtrVT);
2743 // This is an argument in memory. We might be able to perform copy elision.
2744 if (Flags.isCopyElisionCandidate()) {
2745 EVT ArgVT = Ins[i].ArgVT;
2747 if (Ins[i].PartOffset == 0) {
2748 // If this is a one-part value or the first part of a multi-part value,
2749 // create a stack object for the entire argument value type and return a
2750 // load from our portion of it. This assumes that if the first part of an
2751 // argument is in memory, the rest will also be in memory.
2752 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2753 /*Immutable=*/false);
2754 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2756 ValVT, dl, Chain, PartAddr,
2757 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2759 // This is not the first piece of an argument in memory. See if there is
2760 // already a fixed stack object including this offset. If so, assume it
2761 // was created by the PartOffset == 0 branch above and create a load from
2762 // the appropriate offset into it.
2763 int64_t PartBegin = VA.getLocMemOffset();
2764 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2765 int FI = MFI.getObjectIndexBegin();
2766 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2767 int64_t ObjBegin = MFI.getObjectOffset(FI);
2768 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2769 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2772 if (MFI.isFixedObjectIndex(FI)) {
2774 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2775 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2777 ValVT, dl, Chain, Addr,
2778 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2779 Ins[i].PartOffset));
2784 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2785 VA.getLocMemOffset(), isImmutable);
2787 // Set SExt or ZExt flag.
2788 if (VA.getLocInfo() == CCValAssign::ZExt) {
2789 MFI.setObjectZExt(FI, true);
2790 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2791 MFI.setObjectSExt(FI, true);
2794 // Adjust SP offset of interrupt parameter.
2795 if (CallConv == CallingConv::X86_INTR) {
2796 MFI.setObjectOffset(FI, Offset);
2799 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2800 SDValue Val = DAG.getLoad(
2801 ValVT, dl, Chain, FIN,
2802 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2803 return ExtendedInMem
2804 ? (VA.getValVT().isVector()
2805 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2806 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2810 // FIXME: Get this from tablegen.
2811 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2812 const X86Subtarget &Subtarget) {
2813 assert(Subtarget.is64Bit());
2815 if (Subtarget.isCallingConvWin64(CallConv)) {
2816 static const MCPhysReg GPR64ArgRegsWin64[] = {
2817 X86::RCX, X86::RDX, X86::R8, X86::R9
2819 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2822 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2823 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2825 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2828 // FIXME: Get this from tablegen.
2829 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2830 CallingConv::ID CallConv,
2831 const X86Subtarget &Subtarget) {
2832 assert(Subtarget.is64Bit());
2833 if (Subtarget.isCallingConvWin64(CallConv)) {
2834 // The XMM registers which might contain var arg parameters are shadowed
// in their paired GPR. So we only need to save the GPR to their home
// slots.
2837 // TODO: __vectorcall will change this.
2841 const Function *Fn = MF.getFunction();
2842 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2843 bool isSoftFloat = Subtarget.useSoftFloat();
2844 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2845 "SSE register cannot be used when SSE is disabled!");
2846 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
2851 static const MCPhysReg XMMArgRegs64Bit[] = {
2852 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2853 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2855 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2859 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2860 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2861 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2862 return A.getValNo() < B.getValNo();
2867 SDValue X86TargetLowering::LowerFormalArguments(
2868 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2869 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2870 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2871 MachineFunction &MF = DAG.getMachineFunction();
2872 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2873 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2875 const Function *Fn = MF.getFunction();
2876 if (Fn->hasExternalLinkage() &&
2877 Subtarget.isTargetCygMing() &&
2878 Fn->getName() == "main")
2879 FuncInfo->setForceFramePointer(true);
2881 MachineFrameInfo &MFI = MF.getFrameInfo();
2882 bool Is64Bit = Subtarget.is64Bit();
2883 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2886 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2887 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2889 if (CallConv == CallingConv::X86_INTR) {
2890 bool isLegal = Ins.size() == 1 ||
2891 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2892 (!Is64Bit && Ins[1].VT == MVT::i32)));
if (!isLegal)
  report_fatal_error("X86 interrupts may take one or two arguments");
2897 // Assign locations to all of the incoming arguments.
2898 SmallVector<CCValAssign, 16> ArgLocs;
2899 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2901 // Allocate shadow area for Win64.
if (IsWin64)
  CCInfo.AllocateStack(32, 8);
2905 CCInfo.AnalyzeArguments(Ins, CC_X86);
// In the vectorcall calling convention, a second pass is required for the
// HVA (homogeneous vector aggregate) arguments.
2909 if (CallingConv::X86_VectorCall == CallConv) {
2910 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
// The next loop assumes that the locations are in the same order as the
// Ins array.
2915 assert(isSortedByValueNo(ArgLocs) &&
2916 "Argument Location list must be sorted before lowering");
2919 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2921 assert(InsIndex < Ins.size() && "Invalid Ins index");
2922 CCValAssign &VA = ArgLocs[I];
2924 if (VA.isRegLoc()) {
2925 EVT RegVT = VA.getLocVT();
2926 if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
2929 "Currently the only custom case is when we split v64i1 to 2 regs");
// In the regcall calling convention, v64i1 values that are
// compiled for a 32-bit arch are split up into two registers.
ArgValue =
    getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2936 const TargetRegisterClass *RC;
2937 if (RegVT == MVT::i32)
2938 RC = &X86::GR32RegClass;
2939 else if (Is64Bit && RegVT == MVT::i64)
2940 RC = &X86::GR64RegClass;
2941 else if (RegVT == MVT::f32)
2942 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2943 else if (RegVT == MVT::f64)
2944 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2945 else if (RegVT == MVT::f80)
2946 RC = &X86::RFP80RegClass;
2947 else if (RegVT == MVT::f128)
2948 RC = &X86::FR128RegClass;
2949 else if (RegVT.is512BitVector())
2950 RC = &X86::VR512RegClass;
2951 else if (RegVT.is256BitVector())
2952 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2953 else if (RegVT.is128BitVector())
2954 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2955 else if (RegVT == MVT::x86mmx)
2956 RC = &X86::VR64RegClass;
2957 else if (RegVT == MVT::v1i1)
2958 RC = &X86::VK1RegClass;
2959 else if (RegVT == MVT::v8i1)
2960 RC = &X86::VK8RegClass;
2961 else if (RegVT == MVT::v16i1)
2962 RC = &X86::VK16RegClass;
2963 else if (RegVT == MVT::v32i1)
2964 RC = &X86::VK32RegClass;
2965 else if (RegVT == MVT::v64i1)
2966 RC = &X86::VK64RegClass;
2968 llvm_unreachable("Unknown argument type!");
2970 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2971 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
2977 if (VA.getLocInfo() == CCValAssign::SExt)
2978 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2979 DAG.getValueType(VA.getValVT()));
2980 else if (VA.getLocInfo() == CCValAssign::ZExt)
2981 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2982 DAG.getValueType(VA.getValVT()));
2983 else if (VA.getLocInfo() == CCValAssign::BCvt)
2984 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2986 if (VA.isExtInLoc()) {
2987 // Handle MMX values passed in XMM regs.
2988 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2989 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2990 else if (VA.getValVT().isVector() &&
2991 VA.getValVT().getScalarType() == MVT::i1 &&
2992 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2993 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2994 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2995 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
2997 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3000 assert(VA.isMemLoc());
ArgValue =
    LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3005 // If value is passed via pointer - do a load.
3006 if (VA.getLocInfo() == CCValAssign::Indirect)
ArgValue =
    DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3010 InVals.push_back(ArgValue);
3013 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
// The Swift calling convention does not require that we copy the sret
// argument into %rax/%eax for the return. We don't set SRetReturnReg for
// Swift.
3016 if (CallConv == CallingConv::Swift)
3019 // All x86 ABIs require that for returning structs by value we copy the
3020 // sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
3023 if (Ins[I].Flags.isSRet()) {
3024 unsigned Reg = FuncInfo->getSRetReturnReg();
3026 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3027 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3028 FuncInfo->setSRetReturnReg(Reg);
3030 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3031 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3036 unsigned StackSize = CCInfo.getNextStackOffset();
3037 // Align stack specially for tail calls.
3038 if (shouldGuaranteeTCO(CallConv,
3039 MF.getTarget().Options.GuaranteedTailCallOpt))
3040 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes a variable number of arguments, make a frame index for
3043 // the start of the first vararg value... for expansion of llvm.va_start. We
3044 // can skip this if there are no va_start calls.
3045 if (MFI.hasVAStart() &&
3046 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3047 CallConv != CallingConv::X86_ThisCall))) {
3048 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3051 // Figure out if XMM registers are in use.
3052 assert(!(Subtarget.useSoftFloat() &&
3053 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3054 "SSE register cannot be used when SSE is disabled!");
3056 // 64-bit calling conventions support varargs and register parameters, so we
3057 // have to do extra work to spill them in the prologue.
3058 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3059 // Find the first unallocated argument registers.
3060 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3061 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3062 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3063 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3064 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3065 "SSE register cannot be used when SSE is disabled!");
3067 // Gather all the live in physical registers.
3068 SmallVector<SDValue, 6> LiveGPRs;
3069 SmallVector<SDValue, 8> LiveXMMRegs;
3071 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3072 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
LiveGPRs.push_back(
    DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3076 if (!ArgXMMs.empty()) {
3077 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3078 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3079 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3080 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3081 LiveXMMRegs.push_back(
3082 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3087 // Get to the caller-allocated home save location. Add 8 to account
3088 // for the return address.
3089 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3090 FuncInfo->setRegSaveFrameIndex(
3091 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3092 // Fixup to set vararg frame on shadow area (4 x i64).
3094 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3096 // For X86-64, if there are vararg parameters that are passed via
3097 // registers, then we must store them to their spots on the stack so
3098 // they may be loaded by dereferencing the result of va_next.
3099 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3100 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3101 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3102 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
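// Illustrative layout (non-Win64, SSE enabled): the save area is
// 6*8 + 8*16 = 176 bytes. With NumIntRegs == 2 and NumXMMRegs == 1, va_arg
// starts reading GPRs at offset 2*8 = 16 and XMMs at offset 48 + 1*16 = 64.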
3105 // Store the integer parameter registers.
3106 SmallVector<SDValue, 8> MemOps;
3107 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3108 getPointerTy(DAG.getDataLayout()));
3109 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3110 for (SDValue Val : LiveGPRs) {
3111 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3112 RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
    DAG.getStore(Val.getValue(1), dl, Val, FIN,
3115 MachinePointerInfo::getFixedStack(
3116 DAG.getMachineFunction(),
3117 FuncInfo->getRegSaveFrameIndex(), Offset));
3118 MemOps.push_back(Store);
3122 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3123 // Now store the XMM (fp + vector) parameter registers.
3124 SmallVector<SDValue, 12> SaveXMMOps;
3125 SaveXMMOps.push_back(Chain);
3126 SaveXMMOps.push_back(ALVal);
3127 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3128 FuncInfo->getRegSaveFrameIndex(), dl));
3129 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3130 FuncInfo->getVarArgsFPOffset(), dl));
3131 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3133 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3134 MVT::Other, SaveXMMOps));
3137 if (!MemOps.empty())
3138 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3141 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3142 // Find the largest legal vector type.
3143 MVT VecVT = MVT::Other;
3144 // FIXME: Only some x86_32 calling conventions support AVX512.
3145 if (Subtarget.hasAVX512() &&
3146 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3147 CallConv == CallingConv::Intel_OCL_BI)))
3148 VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
  VecVT = MVT::v8f32;
else if (Subtarget.hasSSE2())
  VecVT = MVT::v4f32;
3154 // We forward some GPRs and some vector types.
3155 SmallVector<MVT, 2> RegParmTypes;
3156 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3157 RegParmTypes.push_back(IntVT);
3158 if (VecVT != MVT::Other)
3159 RegParmTypes.push_back(VecVT);
3161 // Compute the set of forwarded registers. The rest are scratch.
3162 SmallVectorImpl<ForwardedRegister> &Forwards =
3163 FuncInfo->getForwardedMustTailRegParms();
3164 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3166 // Conservatively forward AL on x86_64, since it might be used for varargs.
3167 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3168 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3169 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3172 // Copy all forwards from physical to virtual registers.
3173 for (ForwardedRegister &F : Forwards) {
3174 // FIXME: Can we use a less constrained schedule?
3175 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3176 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3177 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3181 // Some CCs need callee pop.
3182 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3183 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3184 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3185 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
// X86 interrupts must pop the error code (and the alignment padding) if
// present.
3188 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3190 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3191 // If this is an sret function, the return should pop the hidden pointer.
3192 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3193 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3194 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3195 FuncInfo->setBytesToPopOnReturn(4);
3199 // RegSaveFrameIndex is X86-64 only.
3200 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3201 if (CallConv == CallingConv::X86_FastCall ||
3202 CallConv == CallingConv::X86_ThisCall)
3203 // fastcc functions can't have varargs.
3204 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3207 FuncInfo->setArgumentStackSize(StackSize);
3209 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3210 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3211 if (Personality == EHPersonality::CoreCLR) {
3213 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3214 // that we'd prefer this slot be allocated towards the bottom of the frame
3215 // (i.e. near the stack pointer after allocating the frame). Every
3216 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3217 // offset from the bottom of this and each funclet's frame must be the
3218 // same, so the size of funclets' (mostly empty) frames is dictated by
3219 // how far this slot is from the bottom (since they allocate just enough
3220 // space to accommodate holding this slot at the correct offset).
3221 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3222 EHInfo->PSPSymFrameIdx = PSPSymFI;
3226 if (CallConv == CallingConv::X86_RegCall ||
3227 Fn->hasFnAttribute("no_caller_saved_registers")) {
3228 const MachineRegisterInfo &MRI = MF.getRegInfo();
3229 for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3230 MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3236 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3237 SDValue Arg, const SDLoc &dl,
3239 const CCValAssign &VA,
3240 ISD::ArgFlagsTy Flags) const {
3241 unsigned LocMemOffset = VA.getLocMemOffset();
3242 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3243 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3245 if (Flags.isByVal())
3246 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3248 return DAG.getStore(
3249 Chain, dl, Arg, PtrOff,
3250 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
/// Emit a load of the return address if tail call
/// optimization is performed and it is required.
3255 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3256 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3257 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3258 // Adjust the Return address stack slot.
3259 EVT VT = getPointerTy(DAG.getDataLayout());
3260 OutRetAddr = getReturnAddressFrameIndex(DAG);
3262 // Load the "old" Return address.
3263 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3264 return SDValue(OutRetAddr.getNode(), 1);
3267 /// Emit a store of the return address if tail call
3268 /// optimization is performed and it is required (FPDiff!=0).
3269 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3270 SDValue Chain, SDValue RetAddrFrIdx,
3271 EVT PtrVT, unsigned SlotSize,
3272 int FPDiff, const SDLoc &dl) {
3273 // Store the return address to the appropriate stack slot.
3274 if (!FPDiff) return Chain;
3275 // Calculate the new stack slot for the return address.
3276 int NewReturnAddrFI =
3277 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3279 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3280 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3281 MachinePointerInfo::getFixedStack(
3282 DAG.getMachineFunction(), NewReturnAddrFI));
/// Returns a vector_shuffle mask for a movs{s|d} or movd
/// operation of specified width.
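/// For example, for VT == v4f32 the mask is <4, 1, 2, 3>: element 0 is
/// taken from V2 and elements 1-3 from V1, matching the movss semantics.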
3288 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3290 unsigned NumElems = VT.getVectorNumElements();
3291 SmallVector<int, 8> Mask;
3292 Mask.push_back(NumElems);
3293 for (unsigned i = 1; i != NumElems; ++i)
3295 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3300 SmallVectorImpl<SDValue> &InVals) const {
3301 SelectionDAG &DAG = CLI.DAG;
3303 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3304 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3305 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3306 SDValue Chain = CLI.Chain;
3307 SDValue Callee = CLI.Callee;
3308 CallingConv::ID CallConv = CLI.CallConv;
3309 bool &isTailCall = CLI.IsTailCall;
3310 bool isVarArg = CLI.IsVarArg;
3312 MachineFunction &MF = DAG.getMachineFunction();
3313 bool Is64Bit = Subtarget.is64Bit();
3314 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3315 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3316 bool IsSibcall = false;
3317 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3318 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3319 const CallInst *CI =
3320 CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3321 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3322 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3323 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3325 if (CallConv == CallingConv::X86_INTR)
3326 report_fatal_error("X86 interrupts may not be called directly");
if (Attr.getValueAsString() == "true")
  isTailCall = false;
3331 if (Subtarget.isPICStyleGOT() &&
3332 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3333 // If we are using a GOT, disable tail calls to external symbols with
3334 // default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// that requires lazy function symbol resolution. Using musttail or
// GuaranteedTailCallOpt will override this.
3338 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3339 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3340 G->getGlobal()->hasDefaultVisibility()))
bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
3350 } else if (isTailCall) {
3351 // Check if it's really possible to do a tail call.
3352 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3353 isVarArg, SR != NotStructReturn,
3354 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3355 Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
  IsSibcall = true;
3366 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3367 "Var args not supported with calling convention fastcc, ghc or hipe");
3369 // Analyze operands of the call, assigning locations to each operand.
3370 SmallVector<CCValAssign, 16> ArgLocs;
3371 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3373 // Allocate shadow area for Win64.
if (IsWin64)
  CCInfo.AllocateStack(32, 8);
3377 CCInfo.AnalyzeArguments(Outs, CC_X86);
// In the vectorcall calling convention, a second pass is required for the
// HVA (homogeneous vector aggregate) arguments.
3381 if (CallingConv::X86_VectorCall == CallConv) {
3382 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3385 // Get a count of how many bytes are to be pushed on the stack.
3386 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
  // This is a sibcall. The memory operands are already available in the
  // caller's own incoming argument space (set up by the caller's caller),
  // so no new stack space is needed.
  NumBytes = 0;
3391 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3392 canGuaranteeTCO(CallConv))
3393 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3396 if (isTailCall && !IsSibcall && !IsMustTail) {
3397 // Lower arguments at fp - stackoffset + fpdiff.
3398 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3400 FPDiff = NumBytesCallerPushed - NumBytes;
// Record the delta by which the return address stack slot moves, but only
// if this delta is larger than any previously recorded one (FPDiff is
// non-positive, so a smaller value means a larger move).
if (FPDiff < X86Info->getTCReturnAddrDelta())
3405 X86Info->setTCReturnAddrDelta(FPDiff);
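// Sketch with illustrative numbers: if the caller pops 16 bytes on return
// but this call pushes 24 bytes of arguments, FPDiff = 16 - 24 = -8 and
// the return address slot moves 8 bytes further down the stack.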
3408 unsigned NumBytesToPush = NumBytes;
3409 unsigned NumBytesToPop = NumBytes;
// If we have an inalloca argument, all stack space has already been allocated
// for us and will be right at the top of the stack. We don't support multiple
// arguments passed in memory when using inalloca.
3414 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3416 if (!ArgLocs.back().isMemLoc())
3417 report_fatal_error("cannot use inalloca attribute on a register "
3419 if (ArgLocs.back().getLocMemOffset() != 0)
3420 report_fatal_error("any parameter with the inalloca attribute must be "
3421 "the only memory argument");
3425 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3426 NumBytes - NumBytesToPush, dl);
3428 SDValue RetAddrFrIdx;
3429 // Load return address for tail calls.
3430 if (isTailCall && FPDiff)
3431 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3432 Is64Bit, FPDiff, dl);
3434 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3435 SmallVector<SDValue, 8> MemOpChains;
// The next loop assumes that the locations are in the same order as the
// Outs array.
3440 assert(isSortedByValueNo(ArgLocs) &&
3441 "Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization, arguments are handled later.
3445 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3446 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3448 assert(OutIndex < Outs.size() && "Invalid Out index");
3449 // Skip inalloca arguments, they have already been written.
3450 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3451 if (Flags.isInAlloca())
3454 CCValAssign &VA = ArgLocs[I];
3455 EVT RegVT = VA.getLocVT();
3456 SDValue Arg = OutVals[OutIndex];
3457 bool isByVal = Flags.isByVal();
3459 // Promote the value if needed.
3460 switch (VA.getLocInfo()) {
3461 default: llvm_unreachable("Unknown loc info!");
3462 case CCValAssign::Full: break;
3463 case CCValAssign::SExt:
3464 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3466 case CCValAssign::ZExt:
3467 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3469 case CCValAssign::AExt:
3470 if (Arg.getValueType().isVector() &&
3471 Arg.getValueType().getVectorElementType() == MVT::i1)
3472 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3473 else if (RegVT.is128BitVector()) {
3474 // Special case: passing MMX values in XMM registers.
3475 Arg = DAG.getBitcast(MVT::i64, Arg);
3476 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3477 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3479 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3481 case CCValAssign::BCvt:
3482 Arg = DAG.getBitcast(RegVT, Arg);
3484 case CCValAssign::Indirect: {
3485 // Store the argument.
3486 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3487 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3488 Chain = DAG.getStore(
3489 Chain, dl, Arg, SpillSlot,
3490 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3496 if (VA.needsCustom()) {
3497 assert(VA.getValVT() == MVT::v64i1 &&
3498 "Currently the only custom case is when we split v64i1 to 2 regs");
3499 // Split v64i1 value into two registers
3500 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3502 } else if (VA.isRegLoc()) {
3503 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3504 if (isVarArg && IsWin64) {
// The Win64 ABI requires an argument XMM reg to be copied to the
// corresponding shadow reg if the callee is a varargs function.
3507 unsigned ShadowReg = 0;
3508 switch (VA.getLocReg()) {
3509 case X86::XMM0: ShadowReg = X86::RCX; break;
3510 case X86::XMM1: ShadowReg = X86::RDX; break;
3511 case X86::XMM2: ShadowReg = X86::R8; break;
3512 case X86::XMM3: ShadowReg = X86::R9; break;
3515 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3517 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3518 assert(VA.isMemLoc());
3519 if (!StackPtr.getNode())
3520 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3521 getPointerTy(DAG.getDataLayout()));
3522 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3523 dl, DAG, VA, Flags));
3527 if (!MemOpChains.empty())
3528 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3530 if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires the GOT to be in the EBX register before function
// calls made via the PLT.
3534 RegsToPass.push_back(std::make_pair(
3535 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3536 getPointerTy(DAG.getDataLayout()))));
3538 // If we are tail calling and generating PIC/GOT style code load the
3539 // address of the callee into ECX. The value in ecx is used as target of
3540 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3541 // for tail calls on PIC/GOT architectures. Normally we would just put the
3542 // address of GOT into ebx and then call target@PLT. But for tail calls
// ebx would be restored (since ebx is callee saved) before jumping to the
// callee.
3546 // Note: The actual moving to ECX is done further down.
3547 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3548 if (G && !G->getGlobal()->hasLocalLinkage() &&
3549 G->getGlobal()->hasDefaultVisibility())
3550 Callee = LowerGlobalAddress(Callee, DAG);
3551 else if (isa<ExternalSymbolSDNode>(Callee))
3552 Callee = LowerExternalSymbol(Callee, DAG);
3556 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3557 // From AMD64 ABI document:
3558 // For calls that may call functions that use varargs or stdargs
3559 // (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as a hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used, and is in the range 0 - 8 inclusive.
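// For example, for a call like printf("%f\n", x) with x passed in XMM0,
// any %al value >= 1 satisfies the ABI; the exact count below is simply
// the tightest legal bound.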
3565 // Count the number of XMM registers allocated.
3566 static const MCPhysReg XMMArgRegs[] = {
3567 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3568 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3570 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3571 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3572 && "SSE registers cannot be used when SSE is disabled");
3574 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3575 DAG.getConstant(NumXMMRegs, dl,
3579 if (isVarArg && IsMustTail) {
3580 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3581 for (const auto &F : Forwards) {
3582 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3583 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3587 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3588 // don't need this because the eligibility check rejects calls that require
3589 // shuffling arguments passed in memory.
3590 if (!IsSibcall && isTailCall) {
3591 // Force all the incoming stack arguments to be loaded from the stack
3592 // before any new outgoing arguments are stored to the stack, because the
3593 // outgoing stack slots may alias the incoming argument stack slots, and
3594 // the alias isn't otherwise explicit. This is slightly more conservative
3595 // than necessary, because it means that each store effectively depends
3596 // on every argument instead of just those arguments it would clobber.
3597 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3599 SmallVector<SDValue, 8> MemOpChains2;
3602 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3604 CCValAssign &VA = ArgLocs[I];
3606 if (VA.isRegLoc()) {
3607 if (VA.needsCustom()) {
3608 assert((CallConv == CallingConv::X86_RegCall) &&
3609 "Expecting custom case only in regcall calling convention");
// This means that we are in the special case where one argument was
// passed through two register locations, so skip the next location.
3618 assert(VA.isMemLoc());
3619 SDValue Arg = OutVals[OutsIndex];
3620 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3621 // Skip inalloca arguments. They don't require any work.
3622 if (Flags.isInAlloca())
3624 // Create frame index.
3625 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3626 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
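// OpSize is the location size rounded up to whole bytes; Offset folds in
// FPDiff so the slot lands where the argument must live after the return
// address moves.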
3627 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3628 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3630 if (Flags.isByVal()) {
// Copy relative to the frame pointer.
3632 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3633 if (!StackPtr.getNode())
3634 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3635 getPointerTy(DAG.getDataLayout()));
3636 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3639 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
// Store relative to the frame pointer.
3644 MemOpChains2.push_back(DAG.getStore(
3645 ArgChain, dl, Arg, FIN,
3646 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3650 if (!MemOpChains2.empty())
3651 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3653 // Store the return address to the appropriate stack slot.
3654 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3655 getPointerTy(DAG.getDataLayout()),
3656 RegInfo->getSlotSize(), FPDiff, dl);
3659 // Build a sequence of copy-to-reg nodes chained together with token chain
3660 // and flag operands which copy the outgoing args into registers.
3662 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3663 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3664 RegsToPass[i].second, InFlag);
3665 InFlag = Chain.getValue(1);
3668 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3669 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3670 // In the 64-bit large code model, we have to make all calls
3671 // through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
3674 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
// If the callee is a GlobalAddress node (quite common, every direct call
// is), turn it into a TargetGlobalAddress node so that legalize doesn't
// hack it.
3678 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
// We should use an extra load for direct calls to dllimported functions in
// 32-bit mode.
3682 const GlobalValue *GV = G->getGlobal();
3683 if (!GV->hasDLLImportStorageClass()) {
3684 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3686 Callee = DAG.getTargetGlobalAddress(
3687 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3689 if (OpFlags == X86II::MO_GOTPCREL) {
3691 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3692 getPointerTy(DAG.getDataLayout()), Callee);
3693 // Add extra indirection
3694 Callee = DAG.getLoad(
3695 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3696 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3699 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3700 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3701 unsigned char OpFlags =
3702 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3704 Callee = DAG.getTargetExternalSymbol(
3705 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3706 } else if (Subtarget.isTarget64BitILP32() &&
3707 Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit one according to the
// x32 ABI.
3709 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3712 // Returns a chain & a flag for retval copy to use.
3713 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3714 SmallVector<SDValue, 8> Ops;
3716 if (!IsSibcall && isTailCall) {
3717 Chain = DAG.getCALLSEQ_END(Chain,
3718 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3719 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3720 InFlag = Chain.getValue(1);
3723 Ops.push_back(Chain);
3724 Ops.push_back(Callee);
3727 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
3731 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3732 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3733 RegsToPass[i].second.getValueType()));
3735 // Add a register mask operand representing the call-preserved registers.
// If HasNCSR is asserted (i.e. the no_caller_saved_registers attribute is
// present), we use the X86_INTR calling convention, because it has the same
// CSR mask (the same set of preserved registers).
3739 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3740 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3741 assert(Mask && "Missing call preserved mask for calling convention");
3743 // If this is an invoke in a 32-bit function using a funclet-based
3744 // personality, assume the function clobbers all registers. If an exception
3745 // is thrown, the runtime will not restore CSRs.
3746 // FIXME: Model this more precisely so that we can register allocate across
3747 // the normal edge and spill and fill across the exceptional edge.
3748 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3749 const Function *CallerFn = MF.getFunction();
3750 EHPersonality Pers =
3751 CallerFn->hasPersonalityFn()
3752 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3753 : EHPersonality::Unknown;
3754 if (isFuncletEHPersonality(Pers))
3755 Mask = RegInfo->getNoPreservedMask();
3758 // Define a new register mask from the existing mask.
3759 uint32_t *RegMask = nullptr;
3761 // In some calling conventions we need to remove the used physical registers
3762 // from the reg mask.
3763 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3764 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3766 // Allocate a new Reg Mask and copy Mask.
3767 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3768 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3769 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
3774 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3775 SubRegs.isValid(); ++SubRegs)
3776 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
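// E.g. if EAX carries an argument, the mask bits for EAX itself and for
// its sub-registers AX, AH and AL are all cleared, marking them clobbered.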
3778 // Create the RegMask Operand according to our updated mask.
3779 Ops.push_back(DAG.getRegisterMask(RegMask));
3781 // Create the RegMask Operand according to the static mask.
3782 Ops.push_back(DAG.getRegisterMask(Mask));
3785 if (InFlag.getNode())
3786 Ops.push_back(InFlag);
3790 //// If this is the first return lowered for this function, add the regs
3791 //// to the liveout set for the function.
3792 // This isn't right, although it's probably harmless on x86; liveouts
3793 // should be computed from returns not tail calls. Consider a void
3794 // function making a tail call to a function returning int.
3795 MF.getFrameInfo().setHasTailCall();
3796 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3799 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3800 InFlag = Chain.getValue(1);
3802 // Create the CALLSEQ_END node.
3803 unsigned NumBytesForCalleeToPop;
3804 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3805 DAG.getTarget().Options.GuaranteedTailCallOpt))
3806 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3807 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3808 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3809 SR == StackStructReturn)
3810 // If this is a call to a struct-return function, the callee
3811 // pops the hidden struct pointer, so we have to push it back.
3812 // This is common for Darwin/X86, Linux & Mingw32 targets.
3813 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3814 NumBytesForCalleeToPop = 4;
3816 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3818 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
// No need to reset the stack after the call if the call doesn't return. To
// keep the MI verifier happy, we'll pretend the callee does it for us.
3821 NumBytesForCalleeToPop = NumBytes;
3824 // Returns a flag for retval copy to use.
3826 Chain = DAG.getCALLSEQ_END(Chain,
3827 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3828 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3831 InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
3836 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3840 //===----------------------------------------------------------------------===//
3841 // Fast Calling Convention (tail call) implementation
3842 //===----------------------------------------------------------------------===//
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization
// is performed provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
3850 // On X86_64 architecture with GOT-style position independent code only local
3851 // (within module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's
// dyld, for example.)
// If a tail-called function has more arguments than the caller, the caller
// needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after
// the original RETADDR, but before the saved frame pointer or the spilled
// registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
/// Align the stack size to, e.g., 16n + 12 for a 16-byte alignment with
/// 4-byte slots, so that the stack stays aligned once the return address
/// slot is accounted for.
unsigned
3874 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3875 SelectionDAG& DAG) const {
3876 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3877 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3878 unsigned StackAlignment = TFI.getStackAlignment();
3879 uint64_t AlignMask = StackAlignment - 1;
3880 int64_t Offset = StackSize;
3881 unsigned SlotSize = RegInfo->getSlotSize();
3882 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3883 // Number smaller than 12 so just add the difference.
3884 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  // Mask out the lower bits and add the stack alignment once, plus the
  // (StackAlignment - SlotSize) bytes.
  Offset = ((~AlignMask) & Offset) + StackAlignment +
           (StackAlignment - SlotSize);
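// Worked example (illustrative): with StackAlignment = 16 and SlotSize = 4
// the target shape is 16n + 12. StackSize = 20 takes the first branch:
// 20 + (12 - 4) = 28 = 16 + 12. StackSize = 30 takes the second:
// (30 & ~15) + 16 + 12 = 16 + 28 = 44 = 32 + 12.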
/// Return true if the given stack call argument is already available in the
/// same (relative) position in the caller's incoming argument stack.
static
3896 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3897 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3898 const X86InstrInfo *TII, const CCValAssign &VA) {
3899 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3902 // Look through nodes that don't alter the bits of the incoming value.
3903 unsigned Op = Arg.getOpcode();
3904 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3905 Arg = Arg.getOperand(0);
3908 if (Op == ISD::TRUNCATE) {
3909 const SDValue &TruncInput = Arg.getOperand(0);
3910 if (TruncInput.getOpcode() == ISD::AssertZext &&
3911 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3912 Arg.getValueType()) {
3913 Arg = TruncInput.getOperand(0);
3921 if (Arg.getOpcode() == ISD::CopyFromReg) {
3922 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3923 if (!TargetRegisterInfo::isVirtualRegister(VR))
3925 MachineInstr *Def = MRI->getVRegDef(VR);
3928 if (!Flags.isByVal()) {
3929 if (!TII->isLoadFromStackSlot(*Def, FI))
3932 unsigned Opcode = Def->getOpcode();
3933 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3934 Opcode == X86::LEA64_32r) &&
3935 Def->getOperand(1).isFI()) {
3936 FI = Def->getOperand(1).getIndex();
3937 Bytes = Flags.getByValSize();
3941 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3942 if (Flags.isByVal())
3943 // ByVal argument is passed in as a pointer but it's now being
3944 // dereferenced. e.g.
3945 // define @foo(%struct.X* %A) {
3946 // tail call @bar(%struct.X* byval %A)
3949 SDValue Ptr = Ld->getBasePtr();
3950 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3953 FI = FINode->getIndex();
3954 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3955 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3956 FI = FINode->getIndex();
3957 Bytes = Flags.getByValSize();
3961 assert(FI != INT_MAX);
3962 if (!MFI.isFixedObjectIndex(FI))
3965 if (Offset != MFI.getObjectOffset(FI))
3968 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3969 // If the argument location is wider than the argument type, check that any
3970 // extension flags match.
3971 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3972 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3977 return Bytes == MFI.getObjectSize(FI);
3980 /// Check whether the call is eligible for tail call optimization. Targets
3981 /// that want to do tail call optimization should implement this function.
3982 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3983 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3984 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3985 const SmallVectorImpl<ISD::OutputArg> &Outs,
3986 const SmallVectorImpl<SDValue> &OutVals,
3987 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3988 if (!mayTailCallThisCC(CalleeCC))
3991 // If -tailcallopt is specified, make fastcc functions tail-callable.
3992 MachineFunction &MF = DAG.getMachineFunction();
3993 const Function *CallerF = MF.getFunction();
3995 // If the function return type is x86_fp80 and the callee return type is not,
3996 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3997 // perform a tailcall optimization here.
3998 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4001 CallingConv::ID CallerCC = CallerF->getCallingConv();
4002 bool CCMatch = CallerCC == CalleeCC;
4003 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4004 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4006 // Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
// space.
4009 if (IsCalleeWin64 != IsCallerWin64)
4012 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4013 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4018 // Look for obvious safe cases to perform tail call optimization that do not
4019 // require ABI changes. This is what gcc calls sibcall.
4021 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4022 // emit a special epilogue.
4023 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4024 if (RegInfo->needsStackRealignment(MF))
4027 // Also avoid sibcall optimization if either caller or callee uses struct
4028 // return semantics.
4029 if (isCalleeStructRet || isCallerStructRet)
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
4034 LLVMContext &C = *DAG.getContext();
4035 if (isVarArg && !Outs.empty()) {
4036 // Optimizing for varargs on Win64 is unlikely to be safe without
4037 // additional testing.
4038 if (IsCalleeWin64 || IsCallerWin64)
4041 SmallVector<CCValAssign, 16> ArgLocs;
4042 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4044 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4045 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4046 if (!ArgLocs[i].isRegLoc())
4050 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4051 // stack. Therefore, if it's not used by the call it is not safe to optimize
4052 // this into a sibcall.
4053 bool Unused = false;
4054 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4061 SmallVector<CCValAssign, 16> RVLocs;
4062 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4063 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4064 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4065 CCValAssign &VA = RVLocs[i];
4066 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4071 // Check that the call results are passed in the same way.
4072 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4073 RetCC_X86, RetCC_X86))
4075 // The callee has to preserve all registers the caller needs to preserve.
4076 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4077 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4079 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4080 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4084 unsigned StackArgsSize = 0;
// If the callee takes no arguments then go on to check the results of the
// call.
4088 if (!Outs.empty()) {
4089 // Check if stack adjustment is needed. For now, do not do this if any
4090 // argument is passed on the stack.
4091 SmallVector<CCValAssign, 16> ArgLocs;
4092 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4094 // Allocate shadow area for Win64
4096 CCInfo.AllocateStack(32, 8);
4098 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4099 StackArgsSize = CCInfo.getNextStackOffset();
4101 if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right positions,
// matching the caller's fixed stack objects.
4104 MachineFrameInfo &MFI = MF.getFrameInfo();
4105 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4106 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4107 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4108 CCValAssign &VA = ArgLocs[i];
4109 SDValue Arg = OutVals[i];
4110 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4111 if (VA.getLocInfo() == CCValAssign::Indirect)
4113 if (!VA.isRegLoc()) {
4114 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4121 bool PositionIndependent = isPositionIndependent();
4122 // If the tailcall address may be in a register, then make sure it's
4123 // possible to register allocate for it. In 32-bit, the call address can
4124 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4125 // callee-saved registers are restored. These happen to be the same
4126 // registers used to pass 'inreg' arguments so watch out for those.
4127 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4128 !isa<ExternalSymbolSDNode>(Callee)) ||
4129 PositionIndependent)) {
4130 unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
4133 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4135 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4136 CCValAssign &VA = ArgLocs[i];
4139 unsigned Reg = VA.getLocReg();
4142 case X86::EAX: case X86::EDX: case X86::ECX:
4143 if (++NumInRegs == MaxInRegs)
4150 const MachineRegisterInfo &MRI = MF.getRegInfo();
4151 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4155 bool CalleeWillPop =
4156 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4157 MF.getTarget().Options.GuaranteedTailCallOpt);
4159 if (unsigned BytesToPop =
4160 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4161 // If we have bytes to pop, the callee must pop them.
4162 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4163 if (!CalleePopMatches)
4165 } else if (CalleeWillPop && StackArgsSize > 0) {
4166 // If we don't have bytes to pop, make sure the callee doesn't pop any.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4175 const TargetLibraryInfo *libInfo) const {
4176 return X86::createFastISel(funcInfo, libInfo);
4179 //===----------------------------------------------------------------------===//
4180 // Other Lowering Hooks
4181 //===----------------------------------------------------------------------===//
4183 static bool MayFoldLoad(SDValue Op) {
4184 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4187 static bool MayFoldIntoStore(SDValue Op) {
4188 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4191 static bool MayFoldIntoZeroExtend(SDValue Op) {
4192 if (Op.hasOneUse()) {
4193 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4194 return (ISD::ZERO_EXTEND == Opcode);
static bool isTargetShuffle(unsigned Opcode) {
  switch (Opcode) {
4201 default: return false;
4202 case X86ISD::BLENDI:
4203 case X86ISD::PSHUFB:
4204 case X86ISD::PSHUFD:
4205 case X86ISD::PSHUFHW:
4206 case X86ISD::PSHUFLW:
4208 case X86ISD::INSERTPS:
4209 case X86ISD::PALIGNR:
4210 case X86ISD::VSHLDQ:
4211 case X86ISD::VSRLDQ:
4212 case X86ISD::MOVLHPS:
4213 case X86ISD::MOVLHPD:
4214 case X86ISD::MOVHLPS:
4215 case X86ISD::MOVLPS:
4216 case X86ISD::MOVLPD:
4217 case X86ISD::MOVSHDUP:
4218 case X86ISD::MOVSLDUP:
4219 case X86ISD::MOVDDUP:
4222 case X86ISD::UNPCKL:
4223 case X86ISD::UNPCKH:
4224 case X86ISD::VBROADCAST:
4225 case X86ISD::VPERMILPI:
4226 case X86ISD::VPERMILPV:
4227 case X86ISD::VPERM2X128:
4228 case X86ISD::VPERMIL2:
4229 case X86ISD::VPERMI:
4230 case X86ISD::VPPERM:
4231 case X86ISD::VPERMV:
4232 case X86ISD::VPERMV3:
4233 case X86ISD::VPERMIV3:
4234 case X86ISD::VZEXT_MOVL:
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
4241 default: return false;
4243 case X86ISD::PSHUFB:
4244 case X86ISD::VPERMILPV:
4245 case X86ISD::VPERMIL2:
4246 case X86ISD::VPPERM:
4247 case X86ISD::VPERMV:
4248 case X86ISD::VPERMV3:
4249 case X86ISD::VPERMIV3:
4251 // 'Faux' Target Shuffles.
4258 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4259 MachineFunction &MF = DAG.getMachineFunction();
4260 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4261 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4262 int ReturnAddrIndex = FuncInfo->getRAIndex();
4264 if (ReturnAddrIndex == 0) {
4265 // Set up a frame object for the return address.
4266 unsigned SlotSize = RegInfo->getSlotSize();
4267 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4270 FuncInfo->setRAIndex(ReturnAddrIndex);
4273 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4276 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4277 bool hasSymbolicDisplacement) {
// The offset should fit into a 32-bit immediate field.
4279 if (!isInt<32>(Offset))
// If we don't have a symbolic displacement, we don't have any extra
// restrictions.
4284 if (!hasSymbolicDisplacement)
4287 // FIXME: Some tweaks might be needed for medium code model.
4288 if (M != CodeModel::Small && M != CodeModel::Kernel)
// For the small code model, we assume that the latest object is 16MB below
// the end of the 31-bit boundary. We may also accept pretty large negative
// constants, knowing that all objects are in the positive half of the
// address space.
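// (E.g. with a symbolic base, an added offset below 16MB is accepted,
// while 16MB or more could push the final address past the 2^31 boundary.)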
4294 if (M == CodeModel::Small && Offset < 16*1024*1024)
// For the kernel code model, we know that all objects reside in the
// negative half of the 32-bit address space. We must not accept negative
// offsets, since they may push an address out of range, but we may accept
// pretty large positive ones.
4300 if (M == CodeModel::Kernel && Offset >= 0)
4306 /// Determines whether the callee is required to pop its own arguments.
4307 /// Callee pop is necessary to support tail calls.
4308 bool X86::isCalleePop(CallingConv::ID CallingConv,
4309 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4310 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4311 // can guarantee TCO.
4312 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4315 switch (CallingConv) {
4318 case CallingConv::X86_StdCall:
4319 case CallingConv::X86_FastCall:
4320 case CallingConv::X86_ThisCall:
4321 case CallingConv::X86_VectorCall:
4326 /// \brief Return true if the condition is an unsigned comparison operation.
4327 static bool isX86CCUnsigned(unsigned X86CC) {
4330 llvm_unreachable("Invalid integer condition!");
4346 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4347 switch (SetCCOpcode) {
4348 default: llvm_unreachable("Invalid integer condition!");
4349 case ISD::SETEQ: return X86::COND_E;
4350 case ISD::SETGT: return X86::COND_G;
4351 case ISD::SETGE: return X86::COND_GE;
4352 case ISD::SETLT: return X86::COND_L;
4353 case ISD::SETLE: return X86::COND_LE;
4354 case ISD::SETNE: return X86::COND_NE;
4355 case ISD::SETULT: return X86::COND_B;
4356 case ISD::SETUGT: return X86::COND_A;
4357 case ISD::SETULE: return X86::COND_BE;
4358 case ISD::SETUGE: return X86::COND_AE;
4362 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4363 /// condition code, returning the condition code and the LHS/RHS of the
4364 /// comparison to make.
4365 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4366 bool isFP, SDValue &LHS, SDValue &RHS,
4367 SelectionDAG &DAG) {
4369 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4370 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4371 // X > -1 -> X == 0, jump !sign.
4372 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4373 return X86::COND_NS;
4375 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4376 // X < 0 -> X == 0, jump on sign.
4379 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
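// X < 1 -> X <= 0 (compare against zero and use the LE condition instead).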
4381 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4382 return X86::COND_LE;
4386 return TranslateIntegerX86CC(SetCCOpcode);
// First determine whether it is required or profitable to flip the operands.
4391 // If LHS is a foldable load, but RHS is not, flip the condition.
4392 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4393 !ISD::isNON_EXTLoad(RHS.getNode())) {
4394 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4395 std::swap(LHS, RHS);
4398 switch (SetCCOpcode) {
4404 std::swap(LHS, RHS);
4408 // On a floating point condition, the flags are set as follows:
4410 // 0 | 0 | 0 | X > Y
4411 // 0 | 0 | 1 | X < Y
4412 // 1 | 0 | 0 | X == Y
4413 // 1 | 1 | 1 | unordered
4414 switch (SetCCOpcode) {
4415 default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}
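
// For example, (setcc x, y, setolt) has its operands swapped above and then
// takes the SETGT path, yielding COND_A; after a UCOMISS/UCOMISD this becomes
// a JA, which is false on unordered inputs, as an ordered compare requires.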
/// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           unsigned Intrinsic) const {

  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
  if (!IntrData)
    return false;

  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.readMem = false;
  Info.writeMem = false;
  switch (IntrData->Type) {
  case EXPAND_FROM_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getType());
    Info.align = 1;
    Info.readMem = true;
    break;
  }
  case COMPRESS_TO_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    Info.ptrVal = I.getArgOperand(0);
    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
      ScalarVT = MVT::i8;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;

    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  default:
    return false;
  }

  return true;
}
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
                                              EVT NewVT) const {
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocations target a movq or addq instruction: don't let the load shrink.
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
  return true;
}
/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}
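
// For example, extracting a v4i32 subvector from a v8i32 is considered cheap
// at index 0 (a subregister copy) and at index 4 (the upper half, matching
// VEXTRACTF128/VEXTRACTI128), but not at an unaligned index such as 2.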
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}

bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}

bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}

bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  EVT VT = Y.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return true;
}
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}

/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
/// Return true if every element in Mask, in the half-open range
/// [Pos, Pos+Size), is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
      return false;
  return true;
}
/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
                             int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
      return false;
  return true;
}
/// Return true if every element in Mask, in the half-open range
/// [Pos, Pos+Size), falls within the specified sequential range
/// [Low, Low+Size) or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// Return true if every element in Mask, in the half-open range
/// [Pos, Pos+Size), falls within the specified sequential range
/// [Low, Low+Size), or is undef or zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}

/// Return true if every element in Mask, in the half-open range
/// [Pos, Pos+Size), is undef or zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (!isUndefOrZero(Mask[i]))
      return false;
  return true;
}
/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zeroed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, it's trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // each other.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }

  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can always
/// succeed.
static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
                             SmallVectorImpl<int> &ScaledMask) {
  assert(0 < Scale && "Unexpected scaling factor");
  int NumElts = Mask.size();
  ScaledMask.assign(NumElts * Scale, -1);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];

    // Repeat sentinel values in every mask element.
    if (M < 0) {
      for (int s = 0; s != Scale; ++s)
        ScaledMask[(Scale * i) + s] = M;
      continue;
    }

    // Scale mask element and increment across each mask element.
    for (int s = 0; s != Scale; ++s)
      ScaledMask[(Scale * i) + s] = (Scale * M) + s;
  }
}
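
// For example, scaling the v4i32 mask <0,2,-1,3> by 2 produces the v8i16 mask
// <0,1,4,5,-1,-1,6,7>: each index M becomes the pair (2*M, 2*M+1) and
// sentinel values are simply repeated.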
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128- or 256-bit
/// vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index = N->getConstantOperandVal(1);
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}
/// Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
/// insertion of 128- or 256-bit subvectors.
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index = N->getConstantOperandVal(2);
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}
bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}

bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}

bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}

bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}
static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
         "Illegal extract subvector for VEXTRACT");

  uint64_t Index = N->getConstantOperandVal(1);
  MVT VecVT = N->getOperand(0).getSimpleValueType();
  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}

static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
         "Illegal insert subvector for VINSERT");

  uint64_t Index = N->getConstantOperandVal(2);
  MVT VecVT = N->getSimpleValueType(0);
  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}
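
// For example, inserting a v4i32 subvector into a v8i32 at element index 4
// gives NumElemsPerChunk == 128/32 == 4 and an immediate of 4/4 == 1, i.e.
// the upper 128-bit half for VINSERTF128/VINSERTI128.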
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 128);
}

/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 256);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 128);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 256);
}
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants into 32-bit halves in 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {

  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
      DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}
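
// For example, getConstVector({0, -1, 2, 3}, MVT::v4i32, DAG, dl,
// /*IsMask=*/true) builds the vector <i32 0, i32 undef, i32 2, i32 3>, since
// IsMask turns negative values into UNDEF lanes.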
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.getVectorElementType() == MVT::i1) {
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
  // low bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF just returns Result.
  if (Vec.isUndef())
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
  // low bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  if (!isa<ConstantSDNode>(Idx))
    return SDValue();

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;
  MVT OpVT = Op.getSimpleValueType();
  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  // There are 3 possible cases:
  // 1. Subvector should be inserted in the lower part (IdxVal == 0)
  // 2. Subvector should be inserted in the upper part
  //    (IdxVal + SubVecNumElems == NumElems)
  // 3. Subvector should be inserted in the middle (for example v2i1
  //    to v16i1, index 2)
  // Extend to a natively supported kshift width.
  MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
  MVT WideOpVT = OpVT;
  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
    WideOpVT = MinVT;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  SDValue Undef = DAG.getUNDEF(WideOpVT);
  SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                                   Undef, SubVec, ZeroIdx);

  // Extract the sub-vector if required.
  auto ExtractSubVec = [&](SDValue V) {
    return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
                                                OpVT, V, ZeroIdx);
  };

  if (Vec.isUndef()) {
    if (IdxVal != 0) {
      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
      WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                               ShiftBits);
    }
    return ExtractSubVec(WideSubVec);
  }
  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    NumElems = WideOpVT.getVectorNumElements();
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                      DAG.getConstant(ShiftLeft, dl, MVT::i8));
    Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                                   DAG.getConstant(ShiftRight, dl, MVT::i8))
                     : Vec;
    return ExtractSubVec(Vec);
  }

  if (IdxVal == 0) {
    // Zero out the lower bits of Vec.
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
                             WideSubVec, ZeroIdx);
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }
  // Simple case: the subvector goes into the upper part.
  if (IdxVal + SubVecNumElems == NumElems) {
    // Zero out the upper bits of Vec.
    WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }

  // The subvector is inserted in the middle - use a shuffle.
  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
                           SubVec, ZeroIdx);
  SmallVector<int, 64> Mask;
  for (unsigned i = 0; i < NumElems; ++i)
    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
                   i : i + NumElems);
  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
}
/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}

static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}
/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected a 128/256/512-bit vector type");

  APInt Ones = APInt::getAllOnesValue(32);
  unsigned NumElts = VT.getSizeInBits() / 32;
  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
  return DAG.getBitcast(VT, Vec);
}
static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
                              SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

  if (VT.is128BitVector() && InVT.is128BitVector())
    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
                                : DAG.getZeroExtendVectorInReg(In, DL, VT);

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    In = extractSubVector(In, 0, DAG, DL,
                          std::max(128, (int)VT.getSizeInBits() / Scale));
  }

  return DAG.getNode(Opc, DL, VT, In);
}
/// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                                    bool Unary) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  int NumElts = VT.getVectorNumElements();
  int NumEltsInLane = 128 / VT.getScalarSizeInBits();

  for (int i = 0; i < NumElts; ++i) {
    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
}
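
// For example, for v8i16 with Lo=true and Unary=false this produces the
// PUNPCKLWD mask <0,8,1,9,2,10,3,11>, interleaving the low halves of the two
// inputs; Lo=false interleaves the upper halves: <4,12,5,13,6,14,7,15>.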
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Return a vector_shuffle of the specified vector and a zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
                                           bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
  int NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec(NumElems);
  for (int i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec[i] = (i == Idx) ? NumElems : i;
  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
static SDValue peekThroughBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}

static SDValue peekThroughOneUseBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
         V.getOperand(0).hasOneUse())
    V = V.getOperand(0);
  return V;
}

static const Constant *getTargetConstantFromNode(SDValue Op) {
  Op = peekThroughBitcasts(Op);

  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    return nullptr;

  SDValue Ptr = Load->getBasePtr();
  if (Ptr->getOpcode() == X86ISD::Wrapper ||
      Ptr->getOpcode() == X86ISD::WrapperRIP)
    Ptr = Ptr->getOperand(0);

  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
  if (!CNode || CNode->isMachineConstantPoolEntry())
    return nullptr;

  return dyn_cast<Constant>(CNode->getConstVal());
}
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                          APInt &UndefElts,
                                          SmallVectorImpl<APInt> &EltBits,
                                          bool AllowWholeUndefs = true,
                                          bool AllowPartialUndefs = true) {
  assert(EltBits.empty() && "Expected an empty EltBits vector");

  Op = peekThroughBitcasts(Op);

  EVT VT = Op.getValueType();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
  unsigned NumElts = SizeInBits / EltSizeInBits;

  unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
  unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

  // Extract all the undef/constant element data and pack into single bitsets.
  APInt UndefBits(SizeInBits, 0);
  APInt MaskBits(SizeInBits, 0);

  // Split the undef/constant single bitset data into the target elements.
  auto SplitBitData = [&]() {
    // Don't split if we don't allow undef bits.
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
    if (UndefBits.getBoolValue() && !AllowUndefs)
      return false;

    UndefElts = APInt(NumElts, 0);
    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

    for (unsigned i = 0; i != NumElts; ++i) {
      unsigned BitOffset = i * EltSizeInBits;
      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
      // Only treat an element as UNDEF if all bits are UNDEF.
      if (UndefEltBits.isAllOnesValue()) {
        if (!AllowWholeUndefs)
          return false;
        UndefElts.setBit(i);
        continue;
      }

      // If only some bits are UNDEF then treat them as zero (or bail if not
      // supported).
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
        return false;

      APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
      EltBits[i] = Bits.getZExtValue();
    }
    return true;
  };
  // Collect constant bits and insert into mask/undef bit masks.
  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
                                unsigned BitOffset) {
    if (isa<UndefValue>(Cst)) {
      unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
      Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
      return true;
    }
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
      Mask.insertBits(CInt->getValue(), BitOffset);
      return true;
    }
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
      Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
      return true;
    }
    return false;
  };
  // Extract constant bits from build vector.
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      const SDValue &Src = Op.getOperand(i);
      unsigned BitOffset = i * SrcEltSizeInBits;
      if (Src.isUndef()) {
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
        continue;
      }
      auto *Cst = cast<ConstantSDNode>(Src);
      APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
      MaskBits.insertBits(Bits, BitOffset);
    }
    return SplitBitData();
  }
  // Extract constant bits from constant pool vector.
  if (auto *Cst = getTargetConstantFromNode(Op)) {
    Type *CstTy = Cst->getType();
    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
      return false;

    unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
    for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
      if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
                               i * CstEltSizeInBits))
        return false;

    return SplitBitData();
  }
  // Extract constant bits from a broadcasted constant pool scalar.
  if (Op.getOpcode() == X86ISD::VBROADCAST &&
      EltSizeInBits <= SrcEltSizeInBits) {
    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
      APInt Bits(SizeInBits, 0);
      APInt Undefs(SizeInBits, 0);
      if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
        for (unsigned i = 0; i != NumSrcElts; ++i) {
          MaskBits |= Bits.shl(i * SrcEltSizeInBits);
          UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
        }
        return SplitBitData();
      }
    }
  }

  // Extract a rematerialized scalar constant insertion.
  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
    MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
    MaskBits = MaskBits.zext(SizeInBits);
    return SplitBitData();
  }

  return false;
}
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                        unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask) {
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;

  // Extract the raw target constant bits.
  // FIXME: We currently don't support UNDEF bits or mask entries.
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
                                     EltBits, /* AllowWholeUndefs */ false,
                                     /* AllowPartialUndefs */ false))
    return false;

  // Insert the extracted elements into the mask.
  for (APInt Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());

  return true;
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  bool IsFakeUnary = false;
  switch(N->getOpcode()) {
  case X86ISD::BLENDI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUFP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::INSERTPS:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    DecodeZeroMoveLowMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::VBROADCAST: {
    SDValue N0 = N->getOperand(0);
    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
    // add the pre-extracted value to the Ops vector.
    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N0.getOperand(0).getValueType() == VT &&
        N0.getConstantOperandVal(1) == 0)
      Ops.push_back(N0.getOperand(0));

    // We only decode broadcasts of same-sized vectors, unless the broadcast
    // came from an extract from the original width. If we found one, we
    // pushed it onto the Ops vector above.
    if (N0.getValueType() == VT || !Ops.empty()) {
      DecodeVectorBroadcast(VT, Mask);
      IsUnary = true;
      break;
    }
    return false;
  }
  case X86ISD::VPERMILPV: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMILPMask(VT, RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMILPMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodePSHUFBMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodePSHUFBMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    DecodeMOVSLDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    DecodeMOVSHDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    DecodeMOVDDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
  case X86ISD::VPERMIL2: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SDValue MaskNode = N->getOperand(2);
    SDValue CtrlNode = N->getOperand(3);
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
      SmallVector<uint64_t, 32> RawMask;
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
        break;
      }
      if (auto *C = getTargetConstantFromNode(MaskNode)) {
        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
        break;
      }
    }
    return false;
  }
  case X86ISD::VPPERM: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    SDValue MaskNode = N->getOperand(2);
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodeVPPERMMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPPERMMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N->getOperand(1));
    SDValue MaskNode = N->getOperand(0);
    SmallVector<uint64_t, 32> RawMask;
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMVMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMVMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV3: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
    Ops.push_back(N->getOperand(0));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMIV3: {
    IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(0);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  default: llvm_unreachable("unknown target shuffle node");
  }

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zeroed elements.
  if (!AllowSentinelZero)
    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
      return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}
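
// For example, a PSHUFD node with immediate 0x1B (binary 00 01 10 11) decodes
// to the unary v4i32 mask <3,2,1,0>, with Ops holding the single input vector.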
/// Check a target shuffle mask's inputs to see if we can set any values to
/// SM_SentinelZero - this is for elements that are known to be zero
/// (not just zeroable) from their inputs.
/// Returns true if the target shuffle mask was decoded.
static bool setTargetShuffleZeroElements(SDValue N,
                                         SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], true, false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], true, false)};
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0)
      continue;

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      Mask[i] = SM_SentinelUndef;
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        Mask[i] = SM_SentinelUndef;
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        Mask[i] = SM_SentinelZero;
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        Mask[i] = SM_SentinelUndef;
      else if (SrcEltBits[SrcIdx][M] == 0)
        Mask[i] = SM_SentinelZero;
    }
  }

  assert(VT.getVectorNumElements() == Mask.size() &&
         "Different mask size from vector size!");
  return true;
}
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<SDValue> &Ops) {
  if (!N.getValueType().isVector())
    return false;

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
         "Expected byte aligned value types");

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask.
    APInt UndefElts;
    SmallVector<APInt, 32> EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      if (UndefElts[i]) {
        Mask.push_back(SM_SentinelUndef);
        continue;
      }
      uint64_t ByteBits = EltBits[i].getZExtValue();
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::SCALAR_TO_VECTOR: {
    // Match against a scalar_to_vector of an extract from a similar vector.
    SDValue N0 = N.getOperand(0);
    if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N0.getOperand(0).getValueType() != VT ||
        !isa<ConstantSDNode>(N0.getOperand(1)) ||
        NumElts <= N0.getConstantOperandVal(1) ||
        !N->isOnlyUserOf(N0.getNode()))
      return false;
    Ops.push_back(N0.getOperand(0));
    Mask.push_back(N0.getConstantOperandVal(1));
    Mask.append(NumElts - 1, SM_SentinelUndef);
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: {
    SDValue InVec = N.getOperand(0);
    SDValue InScl = N.getOperand(1);
    uint64_t InIdx = N.getConstantOperandVal(2);
    assert(InIdx < NumElts && "Illegal insertion index");

    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
    if (X86::isZeroNode(InScl)) {
      Ops.push_back(InVec);
      for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
      return true;
    }

    // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
    unsigned ExOp =
        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
    if (InScl.getOpcode() != ISD::AssertZext ||
        InScl.getOperand(0).getOpcode() != ExOp)
      return false;

    SDValue ExVec = InScl.getOperand(0).getOperand(0);
    uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
    assert(ExIdx < NumElts && "Illegal extraction index");
    Ops.push_back(InVec);
    Ops.push_back(ExVec);
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }

    // We can only decode 'whole byte' bit shifts as shuffles.
    if ((ShiftVal % 8) != 0)
      break;

    uint64_t ByteShift = ShiftVal / 8;
    unsigned NumBytes = NumSizeInBits / 8;
    unsigned NumBytesPerElt = NumBitsPerElt / 8;
    Ops.push_back(N.getOperand(0));

    // Clear mask to all zeros and insert the shifted byte indices.
    Mask.append(NumBytes, SM_SentinelZero);

    if (X86ISD::VSHLI == Opcode) {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
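
  // For example, a v2i64 VSRLI by 16 bits is a byte shift of 2 within each
  // 8-byte element, giving the v16i8 mask
  // <2,3,4,5,6,7,zero,zero, 10,11,12,13,14,15,zero,zero>.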
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VZEXT: {
    // TODO - add support for VPMOVZX with smaller input vector types.
    SDValue Src = N.getOperand(0);
    MVT SrcVT = Src.getSimpleValueType();
    if (NumSizeInBits != SrcVT.getSizeInBits())
      break;
    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}
/// Removes unused shuffle source inputs and adjusts the shuffle mask
/// accordingly.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                              SmallVectorImpl<int> &Mask) {
  int MaskWidth = Mask.size();
  SmallVector<SDValue, 16> UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;
    if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      UsedInputs.push_back(Inputs[i]);
      continue;
    }
    for (int &M : Mask)
      if (lo <= M)
        M -= MaskWidth;
  }
  Inputs = UsedInputs;
}
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
static bool resolveTargetShuffleInputs(SDValue Op,
                                       SmallVectorImpl<SDValue> &Inputs,
                                       SmallVectorImpl<int> &Mask) {
  if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
    if (!getFauxShuffleMask(Op, Mask, Inputs))
      return false;

  resolveTargetShuffleInputsAndMask(Inputs, Mask);
  return true;
}
/// Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue();  // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    unsigned NumElems = VT.getVectorNumElements();
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
                                         : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }
  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = V.getSimpleValueType();
    MVT ShufSVT = ShufVT.getVectorElementType();
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    SmallVector<SDValue, 16> ShuffleOps;
    bool IsUnary;

    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }
  // Actual nodes that may contain scalar elements.
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  SDValue V;
  bool First = true;

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41()) {
    for (unsigned i = 0; i < 16; ++i) {
      bool IsNonZero = (NonZeros & (1 << i)) != 0;
      if (IsNonZero) {
        // If the build vector contains zeros or our first insertion is not the
        // first index then insert into zero vector to break any register
        // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
        if (First) {
          First = false;
          if (NumZero || 0 != i)
            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
          else {
            assert(0 == i && "Expected insertion into zero-index");
            V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
            V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
            V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
            V = DAG.getBitcast(MVT::v16i8, V);
            continue;
          }
        }
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
                        Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
      }
    }

    return V;
  }
  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      // FIXME: Investigate extending to i32 instead of just i16.
      // FIXME: Investigate combining the first 4 bytes as a i32 instead.
      SDValue ThisElt, LastElt;
      bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
      if (LastIsNonZero) {
        LastElt =
            DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
                              DAG.getConstant(8, dl, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt) {
        if (1 == i) {
          V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
                      : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
          V = DAG.getBitcast(MVT::v8i16, V);
        } else {
          V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                          DAG.getIntPtrConstant(i / 2, dl));
        }
      }
    }
  }

  return DAG.getBitcast(MVT::v16i8, V);
}
/// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 4 && !Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  SDValue V;
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool IsNonZero = (NonZeros & (1 << i)) != 0;
    if (IsNonZero) {
      // If the build vector contains zeros or our first insertion is not the
      // first index then insert into zero vector to break any register
      // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
      if (First) {
        First = false;
        if (NumZero || 0 != i)
          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
        else {
          assert(0 == i && "Expected insertion into zero-index");
          V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
          V = DAG.getBitcast(MVT::v8i16, V);
          continue;
        }
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
                      Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
    }
  }

  return V;
}
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // Find all zeroable elements.
  std::bitset<4> Zeroable;
  for (int i=0; i < 4; ++i) {
    SDValue Elt = Op->getOperand(i);
    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
  }
  assert(Zeroable.size() - Zeroable.count() > 1 &&
         "We expect at least two non-zero elements!");

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  SDValue FirstNonZero;
  unsigned FirstNonZeroIdx;
  for (unsigned i=0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op->getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero.
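  // E.g. <x0, zero, x2, zero> becomes shuffle(X, ZeroVec, <0,5,2,7>), where
  // mask indices >= 4 select lanes from the zero vector.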
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = Elt.getConstantOperandVal(1);
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getBitcast(VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
  }

  // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDValue V2 = Elt.getOperand(0);
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!");
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getBitcast(MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getBitcast(MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
  unsigned ZMask = Zeroable.to_ulong();

  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
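  // The INSERTPS imm8 layout is: bits [7:6] = source lane of V2 (CountS),
  // bits [5:4] = destination lane in V1 (CountD), bits [3:0] = lanes to zero
  // (ZMask). E.g. copying V2[2] into lane 1 while zeroing lane 3 encodes as
  // 0b10011000.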
  SDLoc DL(Op);
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                               DAG.getIntPtrConstant(InsertPSMask, DL));
  return DAG.getBitcast(VT, Result);
}
/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                         SelectionDAG &DAG, const TargetLowering &TLI,
                         const SDLoc &dl) {
  assert(VT.is128BitVector() && "Unknown type for VShift");
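  // VSHLDQ/VSRLDQ are whole-register byte shifts (PSLLDQ/PSRLDQ), so the
  // source is bitcast to v16i8 and the shift amount is converted from bits
  // to bytes below.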
  MVT ShVT = MVT::v16i8;
  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
  SrcOp = DAG.getBitcast(ShVT, SrcOp);
  MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
  SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment,
    // improve this code to support it better.
    unsigned RequiredAlign = VT.getSizeInBits()/8;
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI.setObjectAlignment(FI, RequiredAlign);
      }
    }

    // (Offset % 16 or 32) must be multiple of 4. Then address is then
    // Ptr + (Offset & ~15).
    if ((Offset % RequiredAlign) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
    if (StartOffset) {
      SDLoc DL(Ptr);
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
    }

    int EltNo = (Offset - StartOffset) >> 2;
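    // E.g. a 4-byte element at Offset 20 with RequiredAlign 16 loads from
    // StartOffset 16 and splats element (20 - 16) >> 2 == 1 of the wide load.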
    unsigned NumElems = VT.getVectorNumElements();

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset));

    SmallVector<int, 8> Mask(NumElems, EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        bool isAfterLegalize) {
  unsigned NumElems = Elts.size();

  int LastLoadedElt = -1;
  SmallBitVector LoadMask(NumElems, false);
  SmallBitVector ZeroMask(NumElems, false);
  SmallBitVector UndefMask(NumElems, false);

  // For each element in the initializer, see if we've found a load, zero or an
  // undef.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();

    if (Elt.isUndef())
      UndefMask[i] = true;
    else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
      ZeroMask[i] = true;
    else if (ISD::isNON_EXTLoad(Elt.getNode())) {
      LoadMask[i] = true;
      LastLoadedElt = i;
      // Each loaded element must be the correct fractional portion of the
      // requested vector load.
      if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
        return SDValue();
    } else
      return SDValue();
  }
  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
         "Incomplete element masks");

  // Handle Special Cases - all undef or undef/zero.
  if (UndefMask.count() == NumElems)
    return DAG.getUNDEF(VT);

  // FIXME: Should we return this as a BUILD_VECTOR instead?
  if ((ZeroMask | UndefMask).count() == NumElems)
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
                          : DAG.getConstantFP(0.0, DL, VT);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  int FirstLoadedElt = LoadMask.find_first();
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
  LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
  EVT LDBaseVT = EltBase.getValueType();

  // Consecutive loads can contain UNDEFS but not ZERO elements.
  // Consecutive loads with UNDEFs and ZEROs elements require an
  // additional shuffle stage to clear the ZERO elements.
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
      SDValue Elt = peekThroughBitcasts(Elts[i]);
      LoadSDNode *LD = cast<LoadSDNode>(Elt);
      if (!DAG.areNonVolatileConsecutiveLoads(
              LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
              i - FirstLoadedElt)) {
        IsConsecutiveLoad = false;
        IsConsecutiveLoadWithZeros = false;
        break;
      }
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }

  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
    assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
           "Cannot merge volatile loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);

    if (LDBase->hasAnyUseOfValue(1)) {
      SDValue NewChain =
          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
                      SDValue(NewLd.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
                             SDValue(NewLd.getNode(), 1));
    }

    return NewLd;
  };
  // LOAD - all consecutive load/undefs (must start/end with a load).
  // If we have found an entire vector of loads and undefs, then return a large
  // load of the entire vector width starting at the base pointer.
  // If the vector contains zeros, then attempt to shuffle those elements.
  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    assert(LDBase && "Did not find base load for merging consecutive loads");
    EVT EltVT = LDBase->getValueType(0);
    // Ensure that the input vector size for the merged loads matches the
    // cumulative size of the input elements.
    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
      return SDValue();

    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    if (IsConsecutiveLoad)
      return CreateLoad(VT, LDBase);

    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
    // vector and a zero vector to clear out the zero elements.
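    // E.g. <ld0, ld1, zero, ld3> becomes a full vector load shuffled against
    // zero with the mask <0, 1, 6, 3> (indices >= NumElems pick zero lanes).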
    if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
      SmallVector<int, 4> ClearMask(NumElems, -1);
      for (unsigned i = 0; i < NumElems; ++i) {
        if (ZeroMask[i])
          ClearMask[i] = i + NumElems;
        else if (LoadMask[i])
          ClearMask[i] = i;
      }
      SDValue V = CreateLoad(VT, LDBase);
      SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                 : DAG.getConstantFP(0.0, DL, VT);
      return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
    }
  }

  int LoadSize =
      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
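  // E.g. a v4i32 built as <load a, load a+4, zero, zero> becomes a single
  // VZEXT_LOAD of the low 64 bits with the upper lanes zero-filled.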
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      (LoadSize == 32 || LoadSize == 64) &&
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
                                      : MVT::getIntegerVT(LoadSize);
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
    if (TLI.isTypeLegal(VecVT)) {
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode =
          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
                                  LDBase->getPointerInfo(),
                                  LDBase->getAlignment(),
                                  false/*isVolatile*/, true/*ReadMem*/,
                                  false/*WriteMem*/);

      // Make sure the newly-created LOAD is in the same position as LDBase in
      // terms of dependency. We create a TokenFactor for LDBase and ResNode,
      // and update uses of LDBase's output chain to use the TokenFactor.
      if (LDBase->hasAnyUseOfValue(1)) {
        SDValue NewChain =
            DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
                        SDValue(ResNode.getNode(), 1));
        DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
        DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
                               SDValue(ResNode.getNode(), 1));
      }

      return DAG.getBitcast(VT, ResNode);
    }
  }

  return SDValue();
}
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  unsigned ScalarSize = VT.getScalarSizeInBits();
  unsigned NumElm = SplatBitSize / ScalarSize;

  SmallVector<Constant *, 32> ConstantVec;
  for (unsigned i = 0; i < NumElm; i++) {
    APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
    Constant *Const;
    if (VT.isFloatingPoint()) {
      assert((ScalarSize == 32 || ScalarSize == 64) &&
             "Unsupported floating point scalar size");
      if (ScalarSize == 32)
        Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
      else
        Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
    } else
      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
    ConstantVec.push_back(Const);
  }
  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
static bool isUseOfShuffle(SDNode *N) {
  for (auto *U : N->uses()) {
    if (isTargetShuffle(U->getOpcode()))
      return true;
    if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
      return isUseOfShuffle(U);
  }
  return false;
}
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget.hasAVX())
    return SDValue();

  MVT VT = BVOp->getSimpleValueType(0);
  SDLoc dl(BVOp);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  BitVector UndefElements;
  SDValue Ld = BVOp->getSplatValue(&UndefElements);

  // We need a splat of a single value to use broadcast, and it doesn't
  // make any sense if the value is only in one element of the vector.
  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
    APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
    // Check if this is a repeated constant pattern suitable for broadcasting.
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
        SplatBitSize > VT.getScalarSizeInBits() &&
        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with broadcast when it's a use of a shuffle
      // instruction to preserve the present custom lowering of shuffles.
      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
        return SDValue();
      // Replace BUILD_VECTOR with broadcast of the repeated constants.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      LLVMContext *Ctx = DAG.getContext();
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      if (Subtarget.hasAVX()) {
        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
          // Splatted value can fit in one INTEGER constant in constant pool.
          // Load the constant and broadcast it.
          MVT CVT = MVT::getIntegerVT(SplatBitSize);
          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
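          // E.g. a v8i32 whose constants repeat with a 64-bit period has
          // CVT == i64 and Repeat == 4: the i64 constant is broadcast into
          // v4i64 and the result bitcast back to v8i32.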
          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
          // Splatted value can fit in one FLOAT constant in constant pool.
          // Load the constant and broadcast it.
          // AVX has support for 32 and 64 bit broadcast for floats only.
          // No 64bit integer in 32bit subtarget.
          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
          Constant *C = SplatBitSize == 32
                            ? ConstantFP::get(Type::getFloatTy(*Ctx),
                                              SplatValue.bitsToFloat())
                            : ConstantFP::get(Type::getDoubleTy(*Ctx),
                                              SplatValue.bitsToDouble());
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize > 64) {
          // Load the vector of constants and broadcast it.
          MVT CVT = VT.getScalarType();
          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
                                             *Ctx);
          SDValue VCP = DAG.getConstantPool(VecC, PVT);
          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
          Ld = DAG.getLoad(
              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
          return DAG.getBitcast(VT, Brdcst);
        }
      }
    }
    return SDValue();
  }
  bool ConstSplatVal =
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

  // Make sure that all of the users of a non-constant load are from the
  // BUILD_VECTOR node.
  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
    return SDValue();

  unsigned ScalarSize = Ld.getValueSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(
          CVT, dl, DAG.getEntryNode(), CP,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
          Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget.hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget.hasVLX() && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit case, so that it
  // doesn't match f64, since there is no vbroadcastsd xmm.
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}
/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
    return Idx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
  // to:
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                            (extract_subvector (v8f32 %vreg0), Constant<4>),
  //                            undef)
  //                       Constant<2>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
  SDValue ShuffleVec = SVOp->getOperand(0);
  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
  assert(ShuffleVecVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  int ShuffleIdx = SVOp->getMaskElt(Idx);
  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
    ExtractedFromVec = ShuffleVec;
    return ShuffleIdx;
  }
  return Idx;
}
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than one element needs inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  for (unsigned Idx : InsertIndices)
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));

  return NV;
}
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         Op.getScalarValueSizeInBits() == 1 &&
         "Can not convert non-constant vector");
  uint64_t Immediate = 0;
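  // E.g. v4i1 <1,0,0,1> becomes the immediate 0b1001; bit idx of the integer
  // holds element idx, and undef elements contribute a zero bit.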
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (!In.isUndef())
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
  }
  SDLoc dl(Op);
  MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
  return DAG.getConstant(Immediate, dl, VT);
}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
SDValue
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode()))
    return DAG.getTargetConstant(0, dl, VT);

  if (ISD::isBuildVectorAllOnes(Op.getNode()))
    return DAG.getTargetConstant(1, dl, VT);

  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
      return DAG.getBitcast(VT, Imm);
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  // Vector has one or more non-const elements
  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
  bool HasConstElts = false;
  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.isUndef())
      continue;
    if (!isa<ConstantSDNode>(In))
      NonConstIdx.push_back(idx);
    else {
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
      HasConstElts = true;
    }
    if (SplatIdx < 0)
      SplatIdx = idx;
    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
  if (IsSplat)
    return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
                         DAG.getConstant(1, dl, VT),
                         DAG.getConstant(0, dl, VT));

  // insert elements one by one
  SDValue DstVec;
  SDValue Imm;
  if (Immediate) {
    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
    Imm = DAG.getConstant(Immediate, dl, ImmVT);
  }
  else if (HasConstElts)
    Imm = DAG.getConstant(0, dl, VT);
  else
    Imm = DAG.getUNDEF(VT);
  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
    DstVec = DAG.getBitcast(VT, Imm);
  else {
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
  }

  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
    unsigned InsertIdx = NonConstIdx[i];
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}
/// \brief Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
/// operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
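///
/// For instance, the v4f32 build_vector
///   <(fadd A[0], A[1]), (fadd A[2], A[3]), (fadd B[0], B[1]), (fadd B[2], B[3])>
/// matches ISD::FADD over [0, 4) with V0 = A and V1 = B.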
static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
                              SelectionDAG &DAG,
                              unsigned BaseIdx, unsigned LastIdx,
                              SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);

  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->isUndef()) {
      // Update the expected vector extract index.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op0.getOperand(0) == Op1.getOperand(0) &&
               isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

    if (i * 2 < NumElts) {
      if (V0.isUndef()) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.isUndef()) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    }

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}
/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
/// the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as input
/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
/// horizontal binop dag node takes as input the lower 128-bit of V1
/// and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V0_HI
///     HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V1_LO
///     HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  unsigned NumElts = VT.getVectorNumElements();
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
  MVT NewVT = V0_LO.getSimpleValueType();

  SDValue LO = DAG.getUNDEF(NewVT);
  SDValue HI = DAG.getUNDEF(NewVT);

  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && !V0->isUndef())
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && !V1->isUndef())
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
static bool isAddSub(const BuildVectorSDNode *BV,
                     const X86Subtarget &Subtarget, SelectionDAG &DAG,
                     SDValue &Opnd0, SDValue &Opnd1) {

  MVT VT = BV->getSimpleValueType(0);
  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  // Odd-numbered elements in the input build vector are obtained from
  // adding two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting two integer/float elements.
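  // E.g. the v4f32 build_vector
  //   <(fsub A[0], B[0]), (fadd A[1], B[1]), (fsub A[2], B[2]), (fadd A[3], B[3])>
  // is equivalent to ADDSUB(A, B).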
  unsigned ExpectedOpcode = ISD::FSUB;
  unsigned NextExpectedOpcode = ISD::FADD;
  bool AddFound = false;
  bool SubFound = false;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF) {
      std::swap(ExpectedOpcode, NextExpectedOpcode);
      continue;
    }

    // Early exit if we found an unexpected opcode.
    if (Opcode != ExpectedOpcode)
      return false;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return false;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return false;

    // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
      AddFound = true;
    else
      SubFound = true;

    // Update InVec0 and InVec1.
    if (InVec0.isUndef()) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getSimpleValueType() != VT)
        return false;
    }
    if (InVec1.isUndef()) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getSimpleValueType() != VT)
        return false;
    }

    // Make sure that operands in input to each add/sub node always
    // come from a same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (ExpectedOpcode == ISD::FSUB)
        return false;

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return false;
    }

    if (InVec1 != Op1.getOperand(0))
      return false;

    // Update the pair of expected opcodes.
    std::swap(ExpectedOpcode, NextExpectedOpcode);
  }

  // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
  if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
    return false;

  Opnd0 = InVec0;
  Opnd1 = InVec1;
  return true;
}
/// Returns true if it is possible to fold MUL and an idiom that has already
/// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
/// If (and only if) true is returned, the operands of FMADDSUB are written to
/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
/// before replacement of such SDNode with ADDSUB operation. Thus the number
/// of \p Opnd0 uses is expected to be equal to 2.
/// For example, this function may be called for the following IR:
///   %AB = fmul fast <2 x double> %A, %B
///   %Sub = fsub fast <2 x double> %AB, %C
///   %Add = fadd fast <2 x double> %AB, %C
///   %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
///                           <2 x i32> <i32 0, i32 3>
/// There is a def for %Addsub here, which potentially can be replaced by
/// X86ISD::ADDSUB operation:
///   %Addsub = X86ISD::ADDSUB %AB, %C
/// and such ADDSUB can further be replaced with FMADDSUB:
///   %Addsub = FMADDSUB %A, %B, %C.
///
/// The main reason why this method is called before the replacement of the
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                       SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
  if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
      !Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  const TargetOptions &Options = DAG.getTarget().Options;
  bool AllowFusion =
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
  if (!AllowFusion)
    return false;

  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);

  return true;
}
/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
/// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node accordingly.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
    return SDValue();

  MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);

  // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions!
  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
  // recognition.
  if (VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = BV->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  unsigned Half = NumElts/2;

  // Count the number of UNDEF operands in the build_vector in input.
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  // Early exit if this is either a build_vector of all UNDEFs or all the
  // operands but one are UNDEF.
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
    // Try to match an SSE3 float HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
    // Try to match an SSSE3 integer HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
  }
  if (!Subtarget.hasAVX())
    return SDValue();

  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
    // Try to match an AVX horizontal add/sub of packed single/double
    // precision floating point values from 256-bit vectors.
    SDValue InVec2, InVec3;
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Try to match an AVX2 horizontal add/sub of signed integers.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Fold this build_vector into a single horizontal add/sub.
      // Do this only if the target has AVX2.
      if (Subtarget.hasAVX2())
        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binop followed by
      // a concat vector.
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
                                   isUndefLO, isUndefHI);
    }
  }
  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget.hasAVX()) {
    unsigned X86Opcode;
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit to the vectors.
/// NOTE: It's not in our interest to start making a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
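/// For example, (build_vector (and a, 1), (and b, 2)) is rebuilt as
/// (and (build_vector a, b), (build_vector 1, 2)).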
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFS and if so how many?
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
      return SDValue();

  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  switch (Opcode) {
  default:
    return SDValue();
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
    break;
  }

  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    SDValue LHS = Elt.getOperand(0);
    SDValue RHS = Elt.getOperand(1);

    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(RHS))
      return SDValue();
    LHSElts.push_back(LHS);
    RHSElts.push_back(RHS);
  }

  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();

  // Vectors containing all zeros can be matched by pxor and xorps.
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;

    return getZeroVector(VT, Subtarget, DAG, DL);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
        (VT == MVT::v8i32 && Subtarget.hasInt256()))
      return Op;

    return getOnesVector(VT, DAG, DL);
  }

  return SDValue();
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT ExtVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG);

  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
    return VectorConstant;

  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
    return Broadcast;
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
    return BitOp;

  unsigned EVTBits = ExtVT.getSizeInBits();

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.isUndef())
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
      NonZeros |= ((uint64_t)1 << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);
  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits of
    // the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0. It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
        // Handle SSE only.
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
        MVT VecVT = MVT::v4i32;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
                                      Item, Idx * 2, true, Subtarget, DAG));
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
          (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        if (VT.getSizeInBits() >= 256) {
          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
          if (Subtarget.hasAVX()) {
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          } else {
            // Without AVX, we need to extend to a 128-bit vector and then
            // insert into the 256-bit vector.
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
          }
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        }
        return DAG.getBitcast(VT, Item);
      }
    }
    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }
  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // See if we can use a vector load to get all of the elements.
  if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
      return LD;
  }

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);

    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));

    // Recreate the wider vector with the lower and upper part.
    if (VT.is256BitVector())
      return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
    return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }
  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
  if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
      return V;

  // If element VT is == 32 bits, turn it into a number of shuffles.
  if (NumElems == 4 && NumZero > 0) {
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1ULL << i));
      if (isZero)
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
      default: break;
      case 0: Ops[i] = Ops[i*2]; break; // Must be a zero vector.
      case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break;
      case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break;
      case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
  }
  if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector from mostly shuffle plus few inserting.
    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
      return Sh;

    // For SSE 4.1, use insertps to put the high elements into the low element.
    if (Subtarget.hasSSE41()) {
      SDValue Result;
      if (!Op.getOperand(0).isUndef())
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).isUndef()) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
      }
      return Result;
    }
    // Otherwise, expand into a number of unpckl*, start by extending each of
    // our (non-undef) elements to the full vector width with the element in
    // the bottom slot of the vector (which generates no code for SSE).
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < NumElems; ++i) {
      if (!Op.getOperand(i).isUndef())
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        Ops[i] = DAG.getUNDEF(VT);
    }

    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    unsigned EltStride = NumElems >> 1;
    while (EltStride != 0) {
      for (unsigned i = 0; i < EltStride; ++i) {
        // If Ops[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, the one element (since it's the first round) being
        // inserted as undef can be dropped. This isn't safe for successive
        // rounds because they will permute elements within both vectors.
        if (Ops[i+EltStride].isUndef() && EltStride == NumElems/2)
          continue;

        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
      }
      EltStride >>= 1;
    }
    return Ops[0];
  }
  return SDValue();
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  if (ResVT.is256BitVector())
    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

  if (Op.getNumOperands() == 4) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SDValue V3 = Op.getOperand(2);
    SDValue V4 = Op.getOperand(3);
    return concat256BitVectors(
        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
        NumElems, DAG, dl);
  }
  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG & DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOfOperands = Op.getNumOperands();

  assert(isPowerOf2_32(NumOfOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  SDValue Undef = DAG.getUNDEF(ResVT);
  if (NumOfOperands > 2) {
    // Specialize the cases when all, or all but one, of the operands are undef.
    unsigned NumOfDefinedOps = 0;
    unsigned OpIdx = 0;
    for (unsigned i = 0; i < NumOfOperands; i++)
      if (!Op.getOperand(i).isUndef()) {
        NumOfDefinedOps++;
        OpIdx = i;
      }
    if (NumOfDefinedOps == 0)
      return Undef;
    if (NumOfDefinedOps == 1) {
      unsigned SubVecNumElts =
          Op.getOperand(OpIdx).getValueType().getVectorNumElements();
      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
                         Op.getOperand(OpIdx), IdxVal);
    }

    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SmallVector<SDValue, 2> Ops;
    for (unsigned i = 0; i < NumOfOperands/2; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    Ops.clear();
    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  assert(V1.getValueType() == V2.getValueType() &&
         V1.getValueType().getVectorNumElements() == NumElems/2 &&
         "Unexpected operands in CONCAT_VECTORS");

  if (ResVT.getSizeInBits() >= 16)
    return Op; // The operation is legal with KUNPCK

  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
  if (IsZeroV1 && IsZeroV2)
    return ZeroVec;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  if (V2.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  if (IsZeroV2)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);

  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
  if (V1.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
  if (IsZeroV1)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);

  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
                                  Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
  return LowerAVXCONCAT_VECTORS(Op, DAG);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// operations.
//===----------------------------------------------------------------------===//
/// \brief Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}
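// Illustrative example (not from the original source): for a 4-element
// single-input mask, <-1, 1, 2, -1> is a no-op since every defined element
// stays in place, while <1, 0, 2, 3> is not, because elements 0 and 1 swap.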
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}
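// Illustrative example (not from the original source): for v8f32 the lane
// size is 4 elements, so mask <4, 5, 6, 7, 0, 1, 2, 3> crosses lanes (every
// element moves to the other 128-bit half), whereas <1, 0, 3, 2, 5, 4, 7, 6>
// stays in-lane.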
/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
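// Illustrative example (not from the original source): for v8i32 with
// 128-bit lanes (4 elements per lane), the two-input mask
// <0, 8, 1, 9, 4, 12, 5, 13> repeats per-lane and yields
// RepeatedMask = <0, 4, 1, 5>, with the second-vector entries remapped into
// [LaneSize, 2*LaneSize).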
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }

  return true;
}
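// Illustrative usage (not from the original source): lowering code can test
// for an unpcklps-shaped mask on v4f32 with
//   isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})
// and undef (-1) mask elements are accepted in any position.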
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i)
    if (Mask[i] == SM_SentinelUndef)
      continue;
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
      return false;
    else if (Mask[i] != ExpectedMask[i])
      return false;

  return true;
}
// Merges a general DAG shuffle mask and zeroable bit mask into a target
// shuffle mask.
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
                                                    const APInt &Zeroable) {
  int NumElts = Mask.size();
  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
  }
  return TargetMask;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> Unpcklwd;
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
                          /* Unary = */ false);
  SmallVector<int, 8> Unpckhwd;
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                          /* Unary = */ false);
  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
                         isTargetShuffleEquivalent(Mask, Unpckhwd));
  return IsUnpackwdMask;
}
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}

static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
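// Worked example (not from the original source): for Mask = <2, 3, 0, 1>
// each index is packed into 2 bits, lowest element first:
//   Imm = 2 | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E,
// which is the familiar PSHUFD immediate that swaps the two 64-bit halves.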
/// \brief Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
                                            SDValue V1, SDValue V2) {
  APInt Zeroable(Mask.size(), 0);
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef() || X86::isZeroNode(Op))
        Zeroable.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        APInt Val = Cst->getAPIntValue();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllZeroable = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
      }
      if (AllZeroable)
        Zeroable.setBit(i);
      continue;
    }
  }

  return Zeroable;
}
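// Illustrative example (not from the original source): shuffling
//   V1 = build_vector(a, b, c, d), V2 = build_vector(0, 0, 0, 0)
// with Mask = <0, 5, -1, 7> yields Zeroable = 0b1110: element 0 is the only
// one that must be materialized, the rest are undef or known zero.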
// The shuffle result has the following form:
//   0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
// ascending order and '0*' denotes a possibly empty run of zeros. Each
// element of Zeroable corresponds to one element of Mask, as described in
// computeZeroableShuffleElements.
//
// The function looks for a sub-mask whose nonzero elements are in increasing
// order; if such a sub-mask exists, it returns true.
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's nonzero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Checks if the mask's zeros elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non zero element
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
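// Illustrative example (not from the original source): with Mask =
// <0, 0, 3, 1> where elements 0 and 2 are zeroable, the non-zeroable entries
// are Mask[1] = 0 and Mask[3] = 1, an increasing run, so this shuffle is a
// VEXPAND candidate with expansion mask ~Zeroable = 0b1010.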
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2,
                                            const APInt &Zeroable,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
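// Illustrative example (not from the original source): for a v8i16 shuffle,
// each 16-bit mask entry M expands to the byte pair {2*M, 2*M+1}, so an
// element mask beginning <4, zz, ...> yields PSHUFB bytes
// {8, 9, 0x80, 0x80, ...}, where 0x80 (sign bit set) zeros the output byte.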
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);
// X86 has dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                          const APInt &Zeroable,
                                          ArrayRef<int> Mask, SDValue &V1,
                                          SDValue &V2, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getSelect(DL, VT, VMask,
                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
                       ZeroVector);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                        unsigned &UnpackOpcode, bool IsUnary,
                                        ArrayRef<int> TargetMask, SDLoc &DL,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
    UnpackOpcode = X86ISD::UNPCKL;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(Unpckl);
    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(Unpckh);
    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 8> Unpckl;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  SmallVector<int, 8> Unpckh;
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(Unpckl);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(Unpckh);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}
/// \brief Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> MaskOps;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  // We have to cast V2 around.
  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
                                      DAG.getBitcast(MaskVT, V1Mask),
                                      DAG.getBitcast(MaskVT, V2)));
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
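// Illustrative summary (not from the original source): the sequence above
// computes
//   Result = (V1 & M) | (V2 & ~M)
// where M has all-ones elements wherever the shuffle takes from V1, using
// PAND/PANDN/POR when no native blend instruction is available.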
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG);
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
                                      MutableArrayRef<int> TargetMask,
                                      bool &ForceV1Zero, bool &ForceV2Zero,
                                      uint64_t &BlendMask) {
  bool V1IsZeroOrUndef =
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZeroOrUndef =
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

  BlendMask = 0;
  ForceV1Zero = false, ForceV2Zero = false;
  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
    int M = TargetMask[i];
    if (M == SM_SentinelUndef)
      continue;
    if (M == i)
      continue;
    if (M == i + Size) {
      BlendMask |= 1ull << i;
      continue;
    }
    if (M == SM_SentinelZero) {
      if (V1IsZeroOrUndef) {
        ForceV1Zero = true;
        TargetMask[i] = i;
        continue;
      }
      if (V2IsZeroOrUndef) {
        ForceV2Zero = true;
        BlendMask |= 1ull << i;
        TargetMask[i] = i + Size;
        continue;
      }
    }
    return false;
  }
  return true;
}
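// Illustrative example (not from the original source): for a v4i32 target
// mask <0, 5, 2, 7>, elements 1 and 3 come from V2, so matching succeeds
// with BlendMask = 0b1010, the immediate shape consumed by BLENDPS/VPBLENDD.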
uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
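// Worked example (not from the original source): scaling BlendMask = 0b0101
// (Size = 4) by Scale = 2 replicates each bit across the wider elements:
//   scaleVectorShuffleBlendMask(0b0101, 4, 2) == 0b00110011,
// e.g. when retargeting a v4i64 blend at v8i32 granularity for VPBLENDD.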
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Original,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

  uint64_t BlendMask = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
                                 BlendMask))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getConstant(BlendMask, DL, MVT::i8));

  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    LLVM_FALLTHROUGH;
  case MVT::v2i64:
  case MVT::v4i32:
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
    // that instruction.
    if (Subtarget.hasAVX2()) {
      // Scale the blend by the number of 32-bit dwords per element.
      int Scale = VT.getScalarSizeInBits() / 32;
      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
      V1 = DAG.getBitcast(BlendVT, V1);
      V2 = DAG.getBitcast(BlendVT, V2);
      return DAG.getBitcast(
          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
                          DAG.getConstant(BlendMask, DL, MVT::i8)));
    }
    LLVM_FALLTHROUGH;
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
    // v8i16s prior to blending.
    int Scale = 8 / VT.getVectorNumElements();
    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = DAG.getBitcast(MVT::v8i16, V2);
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
  }

  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, DL, MVT::i8));
    }
    LLVM_FALLTHROUGH;
  }
  case MVT::v16i8:
  case MVT::v32i8: {
    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
           "256-bit byte-blends require AVX2 support!");

    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      MVT IntegerType =
          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
    }

    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
    if (SDValue Masked =
            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
      return Masked;

    // Scale the blend by the number of bytes per element.
    int Scale = VT.getScalarSizeInBits() / 8;

    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
                                          MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT,
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
                      V1, V2));
  }
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8: {
    MVT IntegerType =
        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }
  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
/// \brief Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                                   SDValue V1, SDValue V2,
                                                   ArrayRef<int> Mask,
                                                   SelectionDAG &DAG) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[i] = Mask[i] % Size;
  }

  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
                                                          MVT VT, SDValue V1,
                                                          SDValue V2,
                                                          ArrayRef<int> Mask,
                                                          SelectionDAG &DAG) {
  // Shuffle the input elements into the desired positions in V1 and V2 and
  // blend them together.
  SmallVector<int, 32> V1Mask(Mask.size(), -1);
  SmallVector<int, 32> V2Mask(Mask.size(), -1);
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
    } else if (Mask[i] >= Size) {
      V2Mask[i] = Mask[i] - Size;
      BlendMask[i] = i + Size;
    }

  // Try to lower with the simpler initial blend strategy unless one of the
  // input shuffles would be a no-op. We prefer to shuffle inputs as the
  // shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, blending
  // first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
    if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;

  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
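// Illustrative example (not from the original source): a v4i32 mask
// <0, 6, 2, 5> decomposes into V1Mask = <0, u, 2, u>, V2Mask = <u, 2, u, 1>
// and BlendMask = <0, 5, 2, 7>: each input is permuted independently and the
// results are then combined with a single fast blend.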
/// \brief Try to lower a vector shuffle as a rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15,  0,  1,  2]
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      // The identity rotation isn't interesting, stop.
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
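// Worked example (not from the original source): for the two-input v8i16
// mask [11, 12, 13, 14, 15, 0, 1, 2], element 5 takes V1[0], so
// StartIdx = 5 and CandidateRotation = 8 - 5 = 3; element 0 takes V2[3], so
// StartIdx = -3 and CandidateRotation = 3 as well. All candidates agree, so
// the function returns a rotation of 3.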
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                          ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}
static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                DAG.getConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
/// \brief Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                     unsigned ScalarSizeInBits,
                                     ArrayRef<int> Mask, int MaskOffset,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
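// Worked example (not from the original source): for v4i32 with
// Mask = <1, 2, 3, zz> (top element zeroable), the Scale = 4, Shift = 1,
// Left = false case matches: Opcode = X86ISD::VSRLDQ with
// ShiftAmt = 1 * 32 / 8 = 4, i.e. a PSRLDQ $4 across the 128-bit lane.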
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return SDValue();

  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
  // Remainder of lower half result is zero and upper half is all undef.
  auto LowerAsEXTRQ = [&]() {
    // Determine the extraction length from the part of the
    // lower half that isn't zeroable.
    int Len = HalfSize;
    for (; Len > 0; --Len)
      if (!Zeroable[Len - 1])
        break;
    assert(Len > 0 && "Zeroable shuffle mask");

    // Attempt to match first Len sequential elements from the lower half.
    SDValue Src;
    int Idx = -1;
    for (int i = 0; i != Len; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;
      SDValue &V = (M < Size ? V1 : V2);
      M = M % Size;

      // The extracted elements must start at a valid index and all mask
      // elements must be in the lower half.
      if (i > M || M >= HalfSize)
        return SDValue();

      if (Idx < 0 || (Src == V && Idx == (M - i))) {
        Src = V;
        Idx = M - i;
        continue;
      }
      return SDValue();
    }

    if (Idx < 0)
      return SDValue();

    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));
  };

  if (SDValue ExtrQ = LowerAsEXTRQ())
    return ExtrQ;

  // INSERTQ: Extract lowest Len elements from lower half of second source and
  // insert over first source, starting at Idx.
  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
  auto LowerAsInsertQ = [&]() {
    for (int Idx = 0; Idx != HalfSize; ++Idx) {
      SDValue Base;

      // Attempt to match first source from mask before insertion point.
      if (isUndefInRange(Mask, 0, Idx)) {
        /* EMPTY */
      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
        Base = V1;
      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
        Base = V2;
      } else {
        continue;
      }

      // Extend the extraction length looking to match both the insertion of
      // the second source and the remaining elements of the first.
      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
        SDValue Insert;
        int Len = Hi - Idx;

        // Match insertion.
        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
          Insert = V1;
        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
          Insert = V2;
        } else {
          continue;
        }

        // Match the remaining elements of the lower half.
        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
          /* EMPTY */
        } else if ((!Base || (Base == V1)) &&
                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
          Base = V1;
        } else if ((!Base || (Base == V2)) &&
                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                              Size + Hi)) {
          Base = V2;
        } else {
          continue;
        }

        // We may not have a base (first source) - this can safely be undefined.
        if (!Base)
          Base = DAG.getUNDEF(VT);

        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
                           DAG.getConstant(BitLen, DL, MVT::i8),
                           DAG.getConstant(BitIdx, DL, MVT::i8));
      }
    }

    return SDValue();
  };

  if (SDValue InsertQ = LowerAsInsertQ())
    return InsertQ;

  return SDValue();
}
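// Illustrative example (not from the original source): extracting two i16
// elements starting at element 1 of a v8i16 encodes as
// BitLen = 2 * 16 = 32 and BitIdx = 1 * 16 = 16, the 6-bit length and index
// fields of the EXTRQ immediate.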
9289 /// \brief Lower a vector shuffle as a zero or any extension.
9291 /// Given a specific number of elements, element bit width, and extension
9292 /// stride, produce either a zero or any extension based on the available
9293 /// features of the subtarget. The extended elements are consecutive and
9294 /// begin and can start from an offsetted element index in the input; to
9295 /// avoid excess shuffling the offset must either being in the bottom lane
9296 /// or at the start of a higher lane. All extended elements must be from
9298 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9299 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9300 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9301 assert(Scale > 1 && "Need a scale to extend.");
9302 int EltBits = VT.getScalarSizeInBits();
9303 int NumElements = VT.getVectorNumElements();
9304 int NumEltsPerLane = 128 / EltBits;
9305 int OffsetLane = Offset / NumEltsPerLane;
9306 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9307 "Only 8, 16, and 32 bit elements can be extended.");
9308 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9309 assert(0 <= Offset && "Extension offset must be positive.");
9310 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9311 "Extension offset must be in the first lane or start an upper lane.");
9313 // Check that an index is in same lane as the base offset.
9314 auto SafeOffset = [&](int Idx) {
9315 return OffsetLane == (Idx / NumEltsPerLane);
9318 // Shift along an input so that the offset base moves to the first element.
9319 auto ShuffleOffset = [&](SDValue V) {
9323 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9324 for (int i = 0; i * Scale < NumElements; ++i) {
9325 int SrcIdx = i + Offset;
9326 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9328 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9331 // Found a valid zext mask! Try various lowering strategies based on the
9332 // input type and available ISA extensions.
9333 if (Subtarget.hasSSE41()) {
9334 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9335 // PUNPCK will catch this in a later shuffle match.
9336 if (Offset && Scale == 2 && VT.is128BitVector())
9338 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9339 NumElements / Scale);
9340 InputV = ShuffleOffset(InputV);
9341 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9342 return DAG.getBitcast(VT, InputV);
9345 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9347 // For any extends we can cheat for larger element sizes and use shuffle
9348 // instructions that can fold with a load and/or copy.
9349 if (AnyExt && EltBits == 32) {
9350 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9352 return DAG.getBitcast(
9353 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9354 DAG.getBitcast(MVT::v4i32, InputV),
9355 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9357 if (AnyExt && EltBits == 16 && Scale > 2) {
9358 int PSHUFDMask[4] = {Offset / 2, -1,
9359 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9360 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9361 DAG.getBitcast(MVT::v4i32, InputV),
9362 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9363 int PSHUFWMask[4] = {1, -1, -1, -1};
9364 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9365 return DAG.getBitcast(
9366 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9367 DAG.getBitcast(MVT::v8i16, InputV),
9368 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9371 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9373 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9374 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9375 assert(VT.is128BitVector() && "Unexpected vector width!");
9377 int LoIdx = Offset * EltBits;
9378 SDValue Lo = DAG.getBitcast(
9379 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9380 DAG.getConstant(EltBits, DL, MVT::i8),
9381 DAG.getConstant(LoIdx, DL, MVT::i8)));
9383 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9384 !SafeOffset(Offset + 1))
9385 return DAG.getBitcast(VT, Lo);
9387 int HiIdx = (Offset + 1) * EltBits;
9388 SDValue Hi = DAG.getBitcast(
9389 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9390 DAG.getConstant(EltBits, DL, MVT::i8),
9391 DAG.getConstant(HiIdx, DL, MVT::i8)));
9392 return DAG.getBitcast(VT,
9393 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9396 // If this would require more than 2 unpack instructions to expand, use
9397 // pshufb when available. We can only use more than 2 unpack instructions
9398 // when zero extending i8 elements which also makes it easier to use pshufb.
9399 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9400 assert(NumElements == 16 && "Unexpected byte vector width!");
9401 SDValue PSHUFBMask[16];
9402 for (int i = 0; i < 16; ++i) {
9403 int Idx = Offset + (i / Scale);
9404 PSHUFBMask[i] = DAG.getConstant(
9405 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9407 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9408 return DAG.getBitcast(
9409 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9410 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }
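  // For example, zero-extending the low four i8 elements of a v16i8 to v4i32
  // without SSE4.1 (Scale == 4, Offset == 0) takes two trips through the loop
  // below:
  //   punpcklbw InputV, Zero   ; bytes -> words:  [x0 0 x1 0 x2 0 x3 0 ...]
  //   punpcklwd InputV, Zero   ; words -> dwords: [x0 0 0 0 x1 0 0 0 ...]
  // which, bitcast to v4i32, is exactly the zero-extension of x0..x3.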
  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering; it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
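///
/// For example, the v4i32 mask <0, Z, 1, Z> (where Z marks a lane known to be
/// zero) matches this pattern with Scale == 2: it is the zero-extension of
/// V1's low two i32 elements to i64, and can lower to PMOVZXDQ on SSE4.1 or
/// to an unpack with a zero vector otherwise.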
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We are no longer in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      ++Matches;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input; we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
  };
  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice
  // as many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }
  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
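  // For example, the v4i32 mask <0, 1, Z, Z> (upper half zeroable) reduces to
  // a single MOVQ: copy V1's low 64 bits and zero the upper 64 bits.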
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}
/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern for which we have especially efficient lowering
/// patterns across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // elements.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::v4i32;
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    // This is essentially a special case blend operation, but if we have
    // general purpose blend operations, they are always faster. Bail and let
    // the rest of the lowering handle these as blends.
    if (Subtarget.hasSSE41())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
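    // For example, the v4f32 mask <4, 1, 2, 3> becomes MOVSS(V1, V2): V2's
    // low element replaces V1's low element while V1's other lanes pass
    // through.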
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       VT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
                              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
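  // For example, broadcasting bits [31:16] of an i32 scalar into v8i16
  // (Scale == 2, odd BroadcastIdx) shifts the scalar right by 16 first, so
  // the subsequent truncate picks out the intended half.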
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL,
                                         Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumElts = Mask.size();
  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");
  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      SDValue VSrc = V.getOperand(0);
      MVT SrcVT = VSrc.getSimpleValueType();
      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
        break;
      V = VSrc;
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;
  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
    }

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
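    // For example, broadcasting element 2 of a loaded v4f32 narrows the load
    // to a 4-byte scalar load from BaseAddr + 8, which the broadcast can then
    // fold.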
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));

    // Make sure the newly-created LOAD is in the same position as Ld in
    // terms of dependency. We create a TokenFactor for Ld and V,
    // and update uses of Ld's output chain to use the TokenFactor.
    if (Ld->hasAnyUseOfValue(1)) {
      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                     SDValue(Ld, 1), SDValue(V.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
                             SDValue(V.getNode(), 1));
    }
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
    MVT SrcVT = V.getSimpleValueType();
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
           "Unexpected vector size");

    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
                    DAG.getIntPtrConstant(BroadcastIdx, DL));
  }
  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                    DAG.getBitcast(MVT::f64, V));

  // Bitcast back to the same scalar type as BroadcastVT.
  MVT SrcVT = V.getSimpleValueType();
  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
    } else {
      SrcVT = BroadcastVT.getScalarType();
    }
    V = DAG.getBitcast(SrcVT, V);
  }

  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
    V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (SrcVT.getSizeInBits() > 128)
    V = extract128BitVector(V, 0, DAG, DL);

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can always use two SHUFPS instructions, which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
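  // The INSERTPS immediate encodes the source element in bits [7:6], the
  // destination slot in bits [5:4], and a zero mask in bits [3:0]. For
  // example, the v4f32 mask <0, 5, 2, 3> inserts V2[1] into slot 1 of V1,
  // giving an immediate of 0x50.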
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
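///
/// For example, the v4i32 mask <1, 5, 0, 4> alternates between V1 and V2;
/// permuting both inputs with <1, 0, -1, -1> first lets a single PUNPCKLDQ
/// produce [V1[1], V2[1], V1[0], V2[0]], which is exactly the desired result.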
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(VT.is128BitVector() &&
         "This routine only works on 128-bit vectors.");
  assert(!V2.isUndef() &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  int Size = Mask.size();

  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT =
        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack of elements into the wrong half followed by a permute.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
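    // The immediate selects one element per lane, e.g. the splat mask <1, 1>
    // yields immediate 0b11 and duplicates the high element into both lanes.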
    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
  assert(Mask[1] >= 2 && "Non-canonicalized blend!");
  // If we have a single input, insert that into V1 if we can do so cheaply.
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
    // Try inverting the insertion since for v2 masks it is easy to do and we
    // can't reliably sort the mask one way or the other.
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
      return Insertion;
  }

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
          DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// blending.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
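    // For example, the v2i64 mask <1, 0> widens to the v4i32 mask
    // <2, 3, 0, 1>, i.e. a single "pshufd $0x4E".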
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
  // If we have a blend of two same-type PACKUS operations and the blend aligns
  // with the low and high halves, we can just merge the PACKUS operations.
  // This is particularly important as it lets us merge shuffles that this
  // routine itself creates.
  auto GetPackNode = [](SDValue V) {
    V = peekThroughBitcasts(V);
    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
  };
  if (SDValue V1Pack = GetPackNode(V1))
    if (SDValue V2Pack = GetPackNode(V2)) {
      EVT PackVT = V1Pack.getValueType();
      if (PackVT == V2Pack.getValueType())
        return DAG.getBitcast(MVT::v2i64,
                              DAG.getNode(X86ISD::PACKUS, DL, PackVT,
                                          Mask[0] == 0 ? V1Pack.getOperand(0)
                                                       : V1Pack.getOperand(1),
                                          Mask[1] == 2 ? V2Pack.getOperand(0)
                                                       : V2Pack.getOperand(1)));
    }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It is more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
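///
/// For example, <0, 1, 4, 5> qualifies (the low half reads only V1 and the
/// high half only V2), while <0, 4, 1, 5> does not (each half mixes inputs).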
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering; it simply
/// uses it.
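///
/// For example, the v4f32 mask <0, 1, 2, 7> lowers to two SHUFPS:
///   t = shufps V2, V1, <3, 0, 2, 0>  ; t = [V2[3], V2[0], V1[2], V1[0]]
///   r = shufps V1, t,  <0, 1, 2, 0>  ; r = [V1[0], V1[1], V1[2], V2[3]]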
static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      std::swap(LowV, HighV);
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the final
      // shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // a SHUFPS.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// \brief Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // There are special ways we can lower some single-element blends. However, we
  // have custom ways we can lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V =
            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }

  // Use low/high mov instructions.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It is more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // If we have direct support for blends, we should lower by decomposing
    // into a permute. That will be faster than the domain cross.
    if (IsBlendSupported)
      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                        Mask, DAG);

    // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
            DL, MVT::v4i32, V1, V2, Mask, DAG))
      return Unpack;
  }

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // needed.
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  SmallVector<int, 4> LoInputs;
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
  // If we are splatting two values from one half - one to each half - then
  // we can shuffle that half so each is splatted to a dword, then splat those
  // to their respective halves.
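  // For example, the mask <0, 0, 0, 0, 2, 2, 2, 2> splats word 0 across the
  // low half and word 2 across the high half: PSHUFLW <0, 0, 2, 2> packs the
  // two values into the low two dwords, and PSHUFD <0, 0, 1, 1> then splats
  // them to their halves.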
  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
                        int DOffset) {
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine away any redundant sequence of PSHUFD
  // instructions into a single instruction. Here is an example of the tricky
  // case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] ------------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    bool ThreeAInputs = AToAInputs.size() == 3;

    // Compute the index of the dword with only one word among the three inputs
    // in a half by taking the sum of the half with three inputs and
    // subtracting the sum of the actual three inputs. The difference is the
    // remaining slot.
    int ADWord, BDWord;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;
    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
    // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this bit
          // in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
                          VT, V,
                          getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
              M = FixFreeIdx;
            else if (M >= 0 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }

    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M >= 0 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
                                                     DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
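
  // For example (illustrative): with InPlaceInputs == {1, 3} and
  // HalfOffset == 0 above, AdjIndex == 1 ^ 1 == 0, so word 3 is shuffled into
  // slot 0 next to word 1 and the pair then travels as dword 0 in the final
  // PSHUFD.
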
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;

    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else {
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          }
          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }
      return;
    }

    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = InputsFixed[0] + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is no
          // free slot adjacent to one of the inputs. In this case, we have to
          // swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }

    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

  // Now enact all the shuffles we've computed to move the inputs into their
  // target halves.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}

/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
    bool &V2InUse) {
  SDValue V1Mask[16];
  SDValue V2Mask[16];
  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
  int Scale = 16 / Size;
  for (int i = 0; i < 16; ++i) {
    if (Mask[i / Scale] < 0) {
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
    } else {
      const int ZeroMask = 0x80;
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
                                         : ZeroMask;
      int V2Idx = Mask[i / Scale] < Size
                      ? ZeroMask
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
      if (Zeroable[i / Scale])
        V1Idx = V2Idx = ZeroMask;
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      V1InUse |= (ZeroMask != V1Idx);
      V2InUse |= (ZeroMask != V2Idx);
    }
  }
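
  // For example (illustrative): when lowering a v8i16 shuffle here (Size == 8,
  // Scale == 2), a mask entry Mask[0] == 9 selects word 1 of V2, so the loop
  // above sets V2Mask bytes 0-1 to 2 and 3 and V1Mask bytes 0-1 to 0x80 (the
  // PSHUFB "zero this lane" control); the OR below then performs the blend.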

  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V1),
                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V2),
                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}

/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
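///
/// For example (illustrative), the two-input mask <0, 8, 1, 9, 2, 10, 3, 11>
/// is exactly an interleaving of the low halves of V1 and V2 and maps
/// directly onto a single PUNPCKLWD of the two inputs.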
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use shift instructions.
    if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
                                                        Mask, Subtarget, DAG))
      return Rotate;

    // Make a copy of the mask so it can be modified.
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
                                                     MutableMask, Subtarget,
                                                     DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
                                                            V2, Mask, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, DAG, V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to so the fallback strategy is to
  // decompose into single-input permutes and blends.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                    Mask, DAG);
}

/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
    // want.
    if (Mask[i] < 0)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
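        // For example (illustrative): with a 16-element single-input mask,
        // M == 16 and ModMask == 15; for N == 1 and i == 9 the required value
        // is (9 << 1) & 15 == 2, matching the repeating N = 1 pattern above.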
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}

/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;
      return true;
    };
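
    // For example (illustrative): a mask beginning <0, 0, 3, 3, u, 5, ...>
    // can be widened because every defined byte pair agrees, whereas
    // <0, 1, ...> cannot since its first pair selects two different bytes.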

    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple
            // i16 shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3()) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // preference this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend as
      // an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  bool IsSingleInput = V2.isUndef();
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
    //
    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
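    // For example (illustrative): with NumEvenDrops == 1 the 0xFF constant in
    // each i16 lane bitcasts to the byte pattern <FF,00,FF,00,...>, clearing
    // the odd bytes so the PACKUS below can compact the surviving even bytes.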
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }

  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}

/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}

/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
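///
/// For example (illustrative), a v8f32 shuffle is decomposed into two v4f32
/// half shuffles (blends of the four extracted 128-bit halves) whose results
/// are rejoined with ISD::CONCAT_VECTORS.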
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    V = peekThroughBitcasts(V);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getVectorElementType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0, DL));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
    }
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend =
        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend =
        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}

/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
                                                SDValue V1, SDValue V2,
                                                ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
  // that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}

/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
  if (!LaneCrossing[0] || !LaneCrossing[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  SmallVector<int, 32> FlippedBlendMask(Size);
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);
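
  // For example (illustrative): for a v4f64 shuffle with Mask == <0, 3, 2, 1>,
  // elements 1 and 3 cross lanes, so FlippedBlendMask becomes <0, 5, 2, 7>
  // where indices 5 and 7 select from the lane-flipped copy built below.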

  // Flip the vector, and blend the results which should now be in-lane. The
  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
  // 5 for the high source. The value 3 selects the high half of source 2 and
  // the value 2 selects the low half of source 2. We only use source 2 to
  // allow folding it into a memory operand.
  unsigned PERMMask = 3 | 2 << 4;
  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}

/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsV1Zero && !IsV2Zero) {
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
      // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
      if (Subtarget.hasAVX2() && V2.isUndef())
        return SDValue();

      MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                   VT.getVectorNumElements() / 2);
      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                DAG.getIntPtrConstant(0, DL));
      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                OnlyUsesV1 ? V1 : V2,
                                DAG.getIntPtrConstant(0, DL));
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.
  //
  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination

  int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
  int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];

  unsigned PermMask = MaskLO | (MaskHI << 4);
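  // For example (illustrative): WidenedMask == <1, 2> (high 128-bit half of
  // V1 into the low half, low half of V2 into the high half) encodes as
  // PermMask == 1 | (2 << 4) == 0x21.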

  // If either input is a zero vector, replace it with an undef input.
  // Widened mask values < 2 are selecting halves of V1.
  // Widened mask values >= 2 are selecting halves of V2.
  // Adjust each half of the permute mask by clearing the half that was
  // selecting the zero vector and setting the zero mask bit.
  if (IsV1Zero) {
    V1 = DAG.getUNDEF(VT);
    if (MaskLO < 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) {
    V2 = DAG.getUNDEF(VT);
    if (MaskLO >= 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}

/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}

/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
                                               SDValue V1, SDValue V2,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert(VT.is256BitVector() && "Expected 256-bit vector");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
  if (!UndefLower && !UndefUpper)
    return SDValue();

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (UndefUpper &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(HalfNumElts, DL));
  }

  // If the shuffle only uses two of the four halves of the input operands,
  // then extract them and perform the 'half' shuffle at half width.
  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
  int HalfIdx1 = -1, HalfIdx2 = -1;
  SmallVector<int, 8> HalfMask(HalfNumElts);
  unsigned Offset = UndefLower ? HalfNumElts : 0;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
    return SDValue();
  }
  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  // Only shuffle the halves of the inputs when useful.
  int NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  int NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);

  // uuuuXXXX - don't extract uppers just to insert again.
  if (UndefLower && NumUpperHalves != 0)
    return SDValue();

  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
  if (UndefUpper && NumUpperHalves == 2)
    return SDValue();

  // AVX2 - XXXXuuuu - always extract lowers.
  if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();
    // AVX2 supports variable 32-bit element cross-lane shuffles.
    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
      // XXXXuuuu - don't extract lowers and uppers.
      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
        return SDValue();
    }
  }

  auto GetHalfVector = [&](int HalfIdx) {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
                       DAG.getIntPtrConstant(HalfIdx, DL));
  };

  SDValue Half1 = GetHalfVector(HalfIdx1);
  SDValue Half2 = GetHalfVector(HalfIdx2);
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
                     DAG.getIntPtrConstant(Offset, DL));
}

/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}

/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;

  // On AVX2 we may be able to just shuffle the lowest elements and then
  // broadcast the result.
  if (Subtarget.hasAVX2()) {
    for (unsigned BroadcastSize : {16, 32, 64}) {
      if (BroadcastSize <= VT.getScalarSizeInBits())
        continue;
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only referencing the lowest 128-bit
      // lane of the inputs.
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
          for (int j = 0; j != NumBroadcastElts; ++j) {
            int M = Mask[i + j];
            if (M < 0)
              continue;
            int &R = RepeatMask[j];
            if (0 != ((M % NumElts) / NumLaneElts))
              return false;
            if (0 <= R && R != M)
              return false;
            R = M;
          }
        return true;
      };

      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
      if (!FindRepeatingBroadcastMask(RepeatMask))
        continue;

      // Shuffle the (lowest) repeated elements in place for broadcast.
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

      // Shuffle the actual broadcast.
      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
        for (int j = 0; j != NumBroadcastElts; ++j)
          BroadcastMask[i + j] = j;
      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                  BroadcastMask);
    }
  }

  // Bail if the shuffle mask doesn't cross 128-bit lanes.
  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  // Bail if we already have a repeated lane shuffle mask.
  SmallVector<int, 8> RepeatedShuffleMask;
  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
    return SDValue();

  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
  int NumSubLanes = NumLanes * SubLaneScale;
  int NumSubLaneElts = NumLaneElts / SubLaneScale;
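
  // For example (illustrative): for v8f32 on AVX2, SubLaneScale == 2 yields
  // four 64-bit sub-lanes of two elements each, which is the granularity at
  // which VPERMPD/VPERMQ can later place them.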
12298 // Check that all the sources are coming from the same lane and see if we can
12299 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12300 // determine the source sub-lane for each destination sub-lane.
12301 int TopSrcSubLane = -1;
12302 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12303 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12304 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12305 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12307 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12308 // Extract the sub-lane mask, check that it all comes from the same lane
12309 // and normalize the mask entries to come from the first lane.
12311 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12312 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12313 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12316 int Lane = (M % NumElts) / NumLaneElts;
12317 if ((0 <= SrcLane) && (SrcLane != Lane))
12320 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12321 SubLaneMask[Elt] = LocalM;
12324 // Whole sub-lane is UNDEF.
12328 // Attempt to match against the candidate repeated sub-lane masks.
12329 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12330 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12331 for (int i = 0; i != NumSubLaneElts; ++i) {
12332 if (M1[i] < 0 || M2[i] < 0)
12334 if (M1[i] != M2[i])
12340 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12341 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12344 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12345 for (int i = 0; i != NumSubLaneElts; ++i) {
12346 int M = SubLaneMask[i];
12349 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12350 "Unexpected mask element");
12351 RepeatedSubLaneMask[i] = M;
12354 // Track the topmost source sub-lane - by setting the remaining to UNDEF
12355 // we can greatly simplify shuffle matching.
12356 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12357 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12358 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12362 // Bail if we failed to find a matching repeated sub-lane mask.
12363 if (Dst2SrcSubLanes[DstSubLane] < 0)
12366 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12367 "Unexpected source lane");
12369 // Create a repeating shuffle mask for the entire vector.
12370 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12371 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12372 int Lane = SubLane / SubLaneScale;
12373 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12374 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12375 int M = RepeatedSubLaneMask[Elt];
12378 int Idx = (SubLane * NumSubLaneElts) + Elt;
12379 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12382 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12384 // Shuffle each source sub-lane to its destination.
12385 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12386 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12387 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12388 if (SrcSubLane < 0)
12390 for (int j = 0; j != NumSubLaneElts; ++j)
12391 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12394 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12398 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12399 unsigned &ShuffleImm,
12400 ArrayRef<int> Mask) {
12401 int NumElts = VT.getVectorNumElements();
12402 assert(VT.getScalarSizeInBits() == 64 &&
12403 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12404 "Unexpected data type for VSHUFPD");
12406 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
12407 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7.
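// Worked example (illustrative, not from the original source): for
// MVT::v4f64 and Mask = {0, 5, 2, 7}, every element i stays within its
// allowed pair (Val..Val+1), so ShufpdMask remains true, and the immediate
// collects the low bit of each entry:
// (0 % 2) | ((5 % 2) << 1) | ((2 % 2) << 2) | ((7 % 2) << 3) == 0xA.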
12409 bool ShufpdMask = true;
12410 bool CommutableMask = true;
12411 for (int i = 0; i < NumElts; ++i) {
12412 if (Mask[i] == SM_SentinelUndef)
12416 int Val = (i & 6) + NumElts * (i & 1);
12417 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12418 if (Mask[i] < Val || Mask[i] > Val + 1)
12419 ShufpdMask = false;
12420 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12421 CommutableMask = false;
12422 ShuffleImm |= (Mask[i] % 2) << i;
12427 if (CommutableMask) {
12435 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12436 ArrayRef<int> Mask, SDValue V1,
12437 SDValue V2, SelectionDAG &DAG) {
12438 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12439 "Unexpected data type for VSHUFPD");
12441 unsigned Immediate = 0;
12442 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12445 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12446 DAG.getConstant(Immediate, DL, MVT::i8));
12449 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12450 ArrayRef<int> Mask, SDValue V1,
12451 SDValue V2, SelectionDAG &DAG) {
12452 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12453 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12455 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12457 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12459 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12462 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12464 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12465 /// isn't available.
12466 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12467 const APInt &Zeroable,
12468 SDValue V1, SDValue V2,
12469 const X86Subtarget &Subtarget,
12470 SelectionDAG &DAG) {
12471 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12472 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12473 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12475 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12476 Zeroable, Subtarget, DAG))
12479 if (V2.isUndef()) {
12480 // Check for being able to broadcast a single element.
12481 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12482 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12485 // Use low duplicate instructions for masks that match their pattern.
12486 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12487 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12489 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12490 // Non-half-crossing single input shuffles can be lowered with an
12491 // interleaved permutation.
12492 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12493 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12494 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12495 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
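// For illustration (added commentary): Mask = {1, 0, 3, 2}, a swap within
// each 128-bit lane, sets bits 0 and 2 of the immediate (Mask[0] == 1 and
// Mask[2] == 3 hold), so this emits VPERMILPD with immediate 0b0101.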
12498 // With AVX2 we have direct support for this permutation.
12499 if (Subtarget.hasAVX2())
12500 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12501 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12503 // Try to create an in-lane repeating shuffle mask and then shuffle the
12504 // results into the target lanes.
12505 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12506 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12509 // Otherwise, fall back.
12510 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12514 // Use dedicated unpack instructions for masks that match their pattern.
12516 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12519 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12520 Zeroable, Subtarget, DAG))
12523 // Check if the blend happens to exactly fit that of SHUFPD.
12525 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12528 // Try to create an in-lane repeating shuffle mask and then shuffle the
12529 // results into the target lanes.
12530 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12531 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12534 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12535 // shuffle. However, if we have AVX2 and either input is already in place,
12536 // we will be able to shuffle the other input even across lanes in a single
12537 // instruction, so skip this pattern.
12538 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12539 isShuffleMaskInputInPlace(1, Mask))))
12540 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12541 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12543 // If we have VLX support, we can use VEXPAND.
12544 if (Subtarget.hasVLX())
12545 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12546 V1, V2, DAG, Subtarget))
12549 // If we have AVX2 then we always want to lower with a blend because at v4 we
12550 // can fully permute the elements.
12551 if (Subtarget.hasAVX2())
12552 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12555 // Otherwise fall back on generic lowering.
12556 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12559 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12561 /// This routine is only called when we have AVX2 and thus a reasonable
12562 /// instruction set for v4i64 shuffling.
12563 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12564 const APInt &Zeroable,
12565 SDValue V1, SDValue V2,
12566 const X86Subtarget &Subtarget,
12567 SelectionDAG &DAG) {
12568 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12569 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12570 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12571 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12573 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12574 Zeroable, Subtarget, DAG))
12577 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12578 Zeroable, Subtarget, DAG))
12581 // Check for being able to broadcast a single element.
12582 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12583 Mask, Subtarget, DAG))
12586 if (V2.isUndef()) {
12587 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12588 // can use lower latency instructions that will operate on both lanes.
12589 SmallVector<int, 2> RepeatedMask;
12590 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12591 SmallVector<int, 4> PSHUFDMask;
12592 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12593 return DAG.getBitcast(
12595 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12596 DAG.getBitcast(MVT::v8i32, V1),
12597 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12600 // AVX2 provides a direct instruction for permuting a single input across
12601 // lanes.
12602 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12603 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12606 // Try to use shift instructions.
12607 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12608 Zeroable, Subtarget, DAG))
12611 // If we have VLX support, we can use VALIGN or VEXPAND.
12612 if (Subtarget.hasVLX()) {
12613 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12614 Mask, Subtarget, DAG))
12617 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12618 V1, V2, DAG, Subtarget))
12622 // Try to use PALIGNR.
12623 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12624 Mask, Subtarget, DAG))
12627 // Use dedicated unpack instructions for masks that match their pattern.
12629 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12632 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12633 // shuffle. However, if we have AVX2 and either input is already in place,
12634 // we will be able to shuffle the other input even across lanes in a single
12635 // instruction, so skip this pattern.
12636 if (!isShuffleMaskInputInPlace(0, Mask) &&
12637 !isShuffleMaskInputInPlace(1, Mask))
12638 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12639 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12642 // Otherwise fall back on generic blend lowering.
12643 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12647 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12649 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12650 /// isn't available.
12651 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12652 const APInt &Zeroable,
12653 SDValue V1, SDValue V2,
12654 const X86Subtarget &Subtarget,
12655 SelectionDAG &DAG) {
12656 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12657 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12658 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12660 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12661 Zeroable, Subtarget, DAG))
12664 // Check for being able to broadcast a single element.
12665 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12666 Mask, Subtarget, DAG))
12669 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12670 // options to efficiently lower the shuffle.
12671 SmallVector<int, 4> RepeatedMask;
12672 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12673 assert(RepeatedMask.size() == 4 &&
12674 "Repeated masks must be half the mask width!");
12676 // Use even/odd duplicate instructions for masks that match their pattern.
12677 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12678 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12679 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12680 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12683 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12684 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12686 // Use dedicated unpack instructions for masks that match their pattern.
12688 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12691 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12692 // have already handled any direct blends.
12693 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12696 // Try to create an in-lane repeating shuffle mask and then shuffle the
12697 // results into the target lanes.
12698 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12699 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12702 // If we have a single input shuffle with different shuffle patterns in the
12703 // two 128-bit lanes use the variable mask to VPERMILPS.
12704 if (V2.isUndef()) {
12705 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12706 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12707 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12709 if (Subtarget.hasAVX2())
12710 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12712 // Otherwise, fall back.
12713 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12717 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12718 // shuffle.
12719 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12720 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12722 // If we have VLX support, we can use VEXPAND.
12723 if (Subtarget.hasVLX())
12724 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12725 V1, V2, DAG, Subtarget))
12728 // For non-AVX512, if the mask is an in-lane pattern of 16-bit elements, try
12729 // to split, since the split form uses the more efficient vpunpcklwd and
12730 // vpunpckhwd instructions rather than vblend.
12731 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12732 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12736 // If we have AVX2 then we always want to lower with a blend because at v8 we
12737 // can fully permute the elements.
12738 if (Subtarget.hasAVX2())
12739 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12742 // Otherwise fall back on generic lowering.
12743 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12746 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12748 /// This routine is only called when we have AVX2 and thus a reasonable
12749 /// instruction set for v8i32 shuffling.
12750 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12751 const APInt &Zeroable,
12752 SDValue V1, SDValue V2,
12753 const X86Subtarget &Subtarget,
12754 SelectionDAG &DAG) {
12755 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12756 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12757 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12758 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12760 // Whenever we can lower this as a zext, that instruction is strictly faster
12761 // than any alternative. It also allows us to fold memory operands into the
12762 // shuffle in many cases.
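// Illustrative case (added commentary): a v8i32 shuffle whose mask is
// {0, Z, 1, Z, 2, Z, 3, Z}, with Z denoting a known-zero element, is
// equivalent to zero-extending the low four i32 elements to i64 and can be
// emitted as a single VPMOVZXDQ.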
12763 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12764 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12767 // For non-AVX512, if the mask is an in-lane pattern of 16-bit elements, try
12768 // to split, since the split form uses the more efficient vpunpcklwd and
12769 // vpunpckhwd instructions rather than vblend.
12770 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12771 !Subtarget.hasAVX512())
12773 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12776 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12777 Zeroable, Subtarget, DAG))
12780 // Check for being able to broadcast a single element.
12781 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12782 Mask, Subtarget, DAG))
12785 // If the shuffle mask is repeated in each 128-bit lane we can use more
12786 // efficient instructions that mirror the shuffles across the two 128-bit
12787 // lanes.
12788 SmallVector<int, 4> RepeatedMask;
12789 bool Is128BitLaneRepeatedShuffle =
12790 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12791 if (Is128BitLaneRepeatedShuffle) {
12792 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12794 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12795 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12797 // Use dedicated unpack instructions for masks that match their pattern.
12799 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12803 // Try to use shift instructions.
12804 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12805 Zeroable, Subtarget, DAG))
12808 // If we have VLX support, we can use VALIGN or EXPAND.
12809 if (Subtarget.hasVLX()) {
12810 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12811 Mask, Subtarget, DAG))
12814 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
12815 V1, V2, DAG, Subtarget))
12819 // Try to use byte rotation instructions.
12820 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12821 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12824 // Try to create an in-lane repeating shuffle mask and then shuffle the
12825 // results into the target lanes.
12826 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12827 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12830 // If the shuffle patterns aren't repeated but it is a single input, directly
12831 // generate a cross-lane VPERMD instruction.
12832 if (V2.isUndef()) {
12833 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12834 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12837 // Assume that a single SHUFPS is faster than an alternative sequence of
12838 // multiple instructions (even if the CPU has a domain penalty).
12839 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12840 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12841 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12842 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12843 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12844 CastV1, CastV2, DAG);
12845 return DAG.getBitcast(MVT::v8i32, ShufPS);
12848 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12849 // shuffle.
12850 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12851 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12854 // Otherwise fall back on generic blend lowering.
12855 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12859 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12861 /// This routine is only called when we have AVX2 and thus a reasonable
12862 /// instruction set for v16i16 shuffling.
12863 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12864 const APInt &Zeroable,
12865 SDValue V1, SDValue V2,
12866 const X86Subtarget &Subtarget,
12867 SelectionDAG &DAG) {
12868 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12869 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12870 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12871 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12873 // Whenever we can lower this as a zext, that instruction is strictly faster
12874 // than any alternative. It also allows us to fold memory operands into the
12875 // shuffle in many cases.
12876 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12877 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12880 // Check for being able to broadcast a single element.
12881 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12882 Mask, Subtarget, DAG))
12885 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12886 Zeroable, Subtarget, DAG))
12889 // Use dedicated unpack instructions for masks that match their pattern.
12891 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12894 // Try to use shift instructions.
12895 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12896 Zeroable, Subtarget, DAG))
12899 // Try to use byte rotation instructions.
12900 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12901 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12904 // Try to create an in-lane repeating shuffle mask and then shuffle the
12905 // results into the target lanes.
12906 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12907 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12910 if (V2.isUndef()) {
12911 // There are no generalized cross-lane shuffle operations available on i16
12912 // element types.
12913 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12914 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12917 SmallVector<int, 8> RepeatedMask;
12918 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12919 // As this is a single-input shuffle, the repeated mask should be
12920 // a strictly valid v8i16 mask that we can pass through to the v8i16
12921 // lowering to handle even the v16 case.
12922 return lowerV8I16GeneralSingleInputVectorShuffle(
12923 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
12927 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12928 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
12931 // AVX512BWVL can lower to VPERMW.
12932 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12933 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
12935 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12936 // shuffle.
12937 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12938 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12941 // Otherwise fall back on generic lowering.
12942 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
12945 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
12947 /// This routine is only called when we have AVX2 and thus a reasonable
12948 /// instruction set for v32i8 shuffling.
12949 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12950 const APInt &Zeroable,
12951 SDValue V1, SDValue V2,
12952 const X86Subtarget &Subtarget,
12953 SelectionDAG &DAG) {
12954 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12955 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12956 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12957 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
12959 // Whenever we can lower this as a zext, that instruction is strictly faster
12960 // than any alternative. It also allows us to fold memory operands into the
12961 // shuffle in many cases.
12962 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12963 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12966 // Check for being able to broadcast a single element.
12967 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
12968 Mask, Subtarget, DAG))
12971 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
12972 Zeroable, Subtarget, DAG))
12975 // Use dedicated unpack instructions for masks that match their pattern.
12977 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
12980 // Try to use shift instructions.
12981 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
12982 Zeroable, Subtarget, DAG))
12985 // Try to use byte rotation instructions.
12986 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12987 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12990 // Try to create an in-lane repeating shuffle mask and then shuffle the
12991 // results into the target lanes.
12992 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12993 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12996 // There are no generalized cross-lane shuffle operations available on i8
12997 // element types.
12998 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
12999 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13002 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13003 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13006 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13007 // shuffle.
13008 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13009 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13012 // Otherwise fall back on generic lowering.
13013 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13016 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13018 /// This routine either breaks down the specific type of a 256-bit x86 vector
13019 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13020 /// together based on the available instructions.
13021 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13022 MVT VT, SDValue V1, SDValue V2,
13023 const APInt &Zeroable,
13024 const X86Subtarget &Subtarget,
13025 SelectionDAG &DAG) {
13026 // If we have a single input to the zero element, insert that into V1 if we
13027 // can do so cheaply.
13028 int NumElts = VT.getVectorNumElements();
13029 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13031 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13032 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13033 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13036 // Handle special cases where the lower or upper half is UNDEF.
13038 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13041 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13042 // can check for those subtargets here and avoid much of the subtarget
13043 // querying in the per-vector-type lowering routines. With AVX1 we have
13044 // essentially *zero* ability to manipulate a 256-bit vector with integer
13045 // types. Since we'll use floating point types there eventually, just
13046 // immediately cast everything to a float and operate entirely in that domain.
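// For example (illustrative): a v8i32 shuffle on plain AVX is bitcast to
// v8f32, lowered in the floating point domain, and bitcast back, while a
// v16i16 shuffle (ElementBits < 32) has no float type of matching width
// and is handled by bit masking/blending or a 128-bit split instead.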
13047 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13048 int ElementBits = VT.getScalarSizeInBits();
13049 if (ElementBits < 32) {
13050 // No floating point type available, if we can't use the bit operations
13051 // for masking/blending then decompose into 128-bit vectors.
13053 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13055 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13057 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13060 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13061 VT.getVectorNumElements());
13062 V1 = DAG.getBitcast(FpVT, V1);
13063 V2 = DAG.getBitcast(FpVT, V2);
13064 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13067 switch (VT.SimpleTy) {
13069 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13071 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13073 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13075 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13077 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13079 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13082 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13086 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
13087 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13088 ArrayRef<int> Mask, SDValue V1,
13089 SDValue V2, SelectionDAG &DAG) {
13090 assert(VT.getScalarSizeInBits() == 64 &&
13091 "Unexpected element type size for 128bit shuffle.");
13093 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
13094 // is most probably the better solution for that case.
13095 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13097 SmallVector<int, 4> WidenedMask;
13098 if (!canWidenShuffleElements(Mask, WidenedMask))
13101 // Check for patterns which can be matched with a single insert of a 256-bit
13102 // subvector.
13103 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13104 {0, 1, 2, 3, 0, 1, 2, 3});
13105 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13106 {0, 1, 2, 3, 8, 9, 10, 11})) {
13107 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13108 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13109 DAG.getIntPtrConstant(0, DL));
13110 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13111 OnlyUsesV1 ? V1 : V2,
13112 DAG.getIntPtrConstant(0, DL));
13113 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13116 assert(WidenedMask.size() == 4);
13118 // See if this is an insertion of the lower 128-bits of V2 into V1.
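// Illustrative case (not from the original source): for v8i64 with
// WidenedMask = {0, 1, 4, 3}, sub-vectors 0, 1 and 3 of V1 are already in
// place and widened index 4 names the lowest 128 bits of V2, so the whole
// shuffle reduces to inserting V2's low 128 bits at element offset 4 of V1.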
13119 bool IsInsert = true;
13121 for (int i = 0; i < 4; ++i) {
13122 assert(WidenedMask[i] >= -1);
13123 if (WidenedMask[i] < 0)
13126 // Make sure all V1 subvectors are in place.
13127 if (WidenedMask[i] < 4) {
13128 if (WidenedMask[i] != i) {
13133 // Make sure we only have a single V2 index and it's the lowest 128-bits.
13134 if (V2Index >= 0 || WidenedMask[i] != 4) {
13141 if (IsInsert && V2Index >= 0) {
13142 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13143 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13144 DAG.getIntPtrConstant(0, DL));
13145 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13148 // Try to lower to vshuf64x2/vshuf32x4.
13149 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13150 unsigned PermMask = 0;
13151 // Ensure elements came from the same Op.
13152 for (int i = 0; i < 4; ++i) {
13153 assert(WidenedMask[i] >= -1);
13154 if (WidenedMask[i] < 0)
13157 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13158 unsigned OpIndex = i / 2;
13159 if (Ops[OpIndex].isUndef())
13161 else if (Ops[OpIndex] != Op)
13164 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13165 // bits defined by a vshuf64x2 instruction's immediate control byte.
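// Worked example (added for exposition): WidenedMask = {0, 2, 4, 6}
// selects the 128-bit chunks V1[0], V1[2], V2[0], V2[2]; Ops becomes
// {V1, V2} and PermMask == (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6),
// i.e. 0x88.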
13166 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13169 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13170 DAG.getConstant(PermMask, DL, MVT::i8));
13173 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13174 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13175 const APInt &Zeroable,
13176 SDValue V1, SDValue V2,
13177 const X86Subtarget &Subtarget,
13178 SelectionDAG &DAG) {
13179 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13180 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13181 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13183 if (V2.isUndef()) {
13184 // Use low duplicate instructions for masks that match their pattern.
13185 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13186 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13188 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13189 // Non-half-crossing single input shuffles can be lowered with an
13190 // interleaved permutation.
13191 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13192 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13193 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13194 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13195 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13196 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13199 SmallVector<int, 4> RepeatedMask;
13200 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13201 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13202 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13205 if (SDValue Shuf128 =
13206 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13209 if (SDValue Unpck =
13210 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13213 // Check if the blend happens to exactly fit that of SHUFPD.
13215 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13218 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13219 V2, DAG, Subtarget))
13222 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13223 Zeroable, Subtarget, DAG))
13226 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13229 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13230 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13231 const APInt &Zeroable,
13232 SDValue V1, SDValue V2,
13233 const X86Subtarget &Subtarget,
13234 SelectionDAG &DAG) {
13235 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13236 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13237 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13239 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13240 // options to efficiently lower the shuffle.
13241 SmallVector<int, 4> RepeatedMask;
13242 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13243 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13245 // Use even/odd duplicate instructions for masks that match their pattern.
13246 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13247 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13248 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13249 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13252 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13253 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13255 // Use dedicated unpack instructions for masks that match their pattern.
13256 if (SDValue Unpck =
13257 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13260 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13261 Zeroable, Subtarget, DAG))
13264 // Otherwise, fall back to a SHUFPS sequence.
13265 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13267 // If we have AVX512F support, we can use VEXPAND.
13268 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13269 V1, V2, DAG, Subtarget))
13272 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13275 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13276 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13277 const APInt &Zeroable,
13278 SDValue V1, SDValue V2,
13279 const X86Subtarget &Subtarget,
13280 SelectionDAG &DAG) {
13281 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13282 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13283 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13285 if (SDValue Shuf128 =
13286 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13289 if (V2.isUndef()) {
13290 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13291 // can use lower latency instructions that will operate on all four
13292 // 128-bit lanes.
13293 SmallVector<int, 2> Repeated128Mask;
13294 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13295 SmallVector<int, 4> PSHUFDMask;
13296 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13297 return DAG.getBitcast(
13299 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13300 DAG.getBitcast(MVT::v16i32, V1),
13301 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13304 SmallVector<int, 4> Repeated256Mask;
13305 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13306 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13307 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13310 // Try to use shift instructions.
13311 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13312 Zeroable, Subtarget, DAG))
13315 // Try to use VALIGN.
13316 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13317 Mask, Subtarget, DAG))
13320 // Try to use PALIGNR.
13321 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13322 Mask, Subtarget, DAG))
13325 if (SDValue Unpck =
13326 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13328 // If we have AVX512F support, we can use VEXPAND.
13329 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13330 V2, DAG, Subtarget))
13333 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13334 Zeroable, Subtarget, DAG))
13337 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13340 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13341 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13342 const APInt &Zeroable,
13343 SDValue V1, SDValue V2,
13344 const X86Subtarget &Subtarget,
13345 SelectionDAG &DAG) {
13346 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13347 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13348 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13350 // Whenever we can lower this as a zext, that instruction is strictly faster
13351 // than any alternative. It also allows us to fold memory operands into the
13352 // shuffle in many cases.
13353 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13354 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13357 // If the shuffle mask is repeated in each 128-bit lane we can use more
13358 // efficient instructions that mirror the shuffles across the four 128-bit
13359 // lanes.
13360 SmallVector<int, 4> RepeatedMask;
13361 bool Is128BitLaneRepeatedShuffle =
13362 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13363 if (Is128BitLaneRepeatedShuffle) {
13364 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13366 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13367 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13369 // Use dedicated unpack instructions for masks that match their pattern.
13371 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13375 // Try to use shift instructions.
13376 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13377 Zeroable, Subtarget, DAG))
13380 // Try to use VALIGN.
13381 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13382 Mask, Subtarget, DAG))
13385 // Try to use byte rotation instructions.
13386 if (Subtarget.hasBWI())
13387 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13388 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13391 // Assume that a single SHUFPS is faster than using a permv shuffle.
13392 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13393 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13394 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13395 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13396 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13397 CastV1, CastV2, DAG);
13398 return DAG.getBitcast(MVT::v16i32, ShufPS);
13400 // If we have AVX512F support, we can use VEXPAND.
13401 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13402 V1, V2, DAG, Subtarget))
13405 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13406 Zeroable, Subtarget, DAG))
13408 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13411 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13412 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13413 const APInt &Zeroable,
13414 SDValue V1, SDValue V2,
13415 const X86Subtarget &Subtarget,
13416 SelectionDAG &DAG) {
13417 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13418 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13419 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13420 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13422 // Whenever we can lower this as a zext, that instruction is strictly faster
13423 // than any alternative. It also allows us to fold memory operands into the
13424 // shuffle in many cases.
13425 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13426 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13429 // Use dedicated unpack instructions for masks that match their pattern.
13431 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13434 // Try to use shift instructions.
13435 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13436 Zeroable, Subtarget, DAG))
13439 // Try to use byte rotation instructions.
13440 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13441 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13444 if (V2.isUndef()) {
13445 SmallVector<int, 8> RepeatedMask;
13446 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13447 // As this is a single-input shuffle, the repeated mask should be
13448 // a strictly valid v8i16 mask that we can pass through to the v8i16
13449 // lowering to handle even the v32 case.
13450 return lowerV8I16GeneralSingleInputVectorShuffle(
13451 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13455 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13456 Zeroable, Subtarget, DAG))
13459 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13462 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13463 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13464 const APInt &Zeroable,
13465 SDValue V1, SDValue V2,
13466 const X86Subtarget &Subtarget,
13467 SelectionDAG &DAG) {
13468 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13469 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13470 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13471 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13473 // Whenever we can lower this as a zext, that instruction is strictly faster
13474 // than any alternative. It also allows us to fold memory operands into the
13475 // shuffle in many cases.
13476 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13477 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13480 // Use dedicated unpack instructions for masks that match their pattern.
13482 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13485 // Try to use shift instructions.
13486 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13487 Zeroable, Subtarget, DAG))
13490 // Try to use byte rotation instructions.
13491 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13492 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13495 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13496 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13499 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13500 if (Subtarget.hasVBMI())
13501 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13503 // Try to create an in-lane repeating shuffle mask and then shuffle the
13504 // the results into the target lanes.
13505 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13506 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13509 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13510 Zeroable, Subtarget, DAG))
13513 // FIXME: Implement direct support for this type!
13514 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13517 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13519 /// This routine either breaks down the specific type of a 512-bit x86 vector
13520 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13521 /// together based on the available instructions.
13522 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13523 MVT VT, SDValue V1, SDValue V2,
13524 const APInt &Zeroable,
13525 const X86Subtarget &Subtarget,
13526 SelectionDAG &DAG) {
13527 assert(Subtarget.hasAVX512() &&
13528 "Cannot lower 512-bit vectors w/ basic ISA!");
13530 // If we have a single input to the zero element, insert that into V1 if we
13531 // can do so cheaply.
13532 int NumElts = Mask.size();
13533 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13535 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13536 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13537 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13540 // Check for being able to broadcast a single element.
13541 if (SDValue Broadcast =
13542 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13545 // Dispatch to each element type for lowering. If we don't have support for
13546 // specific element type shuffles at 512 bits, immediately split them and
13547 // lower them. Each lowering routine of a given type is allowed to assume that
13548 // the requisite ISA extensions for that element type are available.
13549 switch (VT.SimpleTy) {
13551 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13553 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13555 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13557 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13559 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13561 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13564 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13568 // Lower vXi1 vector shuffles.
13569 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13570 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13571 // vector, shuffle, and then truncate it back.
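// For instance (illustrative): a v16i1 shuffle is sign-extended to v16i32,
// shuffled with the ordinary 512-bit lowering, and converted back to v16i1
// via X86ISD::CVT2MASK or a plain truncate, per the checks below.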
13572 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13573 MVT VT, SDValue V1, SDValue V2,
13574 const X86Subtarget &Subtarget,
13575 SelectionDAG &DAG) {
13576 assert(Subtarget.hasAVX512() &&
13577 "Cannot lower 512-bit vectors w/o basic ISA!");
13579 switch (VT.SimpleTy) {
13581 llvm_unreachable("Expected a vector of i1 elements");
13583 ExtVT = MVT::v2i64;
13586 ExtVT = MVT::v4i32;
13589 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13592 ExtVT = MVT::v16i32;
13595 ExtVT = MVT::v32i16;
13598 ExtVT = MVT::v64i8;
13602 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13603 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13604 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13605 V1 = getOnesVector(ExtVT, DAG, DL);
13607 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13610 V2 = DAG.getUNDEF(ExtVT);
13611 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13612 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13613 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13614 V2 = getOnesVector(ExtVT, DAG, DL);
13616 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13618 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13619 // As i1 was sign-extended, we can use X86ISD::CVT2MASK.
13620 int NumElems = VT.getVectorNumElements();
13621 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13622 (Subtarget.hasDQI() && (NumElems < 32)))
13623 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13625 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13628 /// Helper function that returns true if the shuffle mask should be
13629 /// commuted to improve canonicalization.
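/// For example (illustrative, added commentary): for a v4 shuffle with
/// Mask = {4, 5, 6, 3}, three elements come from V2 and only one from V1,
/// so the mask is commuted - each index is flipped across NumElements -
/// and the shuffle is matched as {0, 1, 2, 7} with the operands swapped.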
13630 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13631 int NumElements = Mask.size();
13633 int NumV1Elements = 0, NumV2Elements = 0;
13637 else if (M < NumElements)
13642 // Commute the shuffle as needed such that more elements come from V1 than
13643 // V2. This allows us to match the shuffle pattern strictly on how many
13644 // elements come from V1 without handling the symmetric cases.
13645 if (NumV2Elements > NumV1Elements)
13648 assert(NumV1Elements > 0 && "No V1 indices");
13650 if (NumV2Elements == 0)
13653 // When the numbers of V1 and V2 elements are the same, try to minimize the
13654 // number of uses of V2 in the low half of the vector. When that is tied,
13655 // ensure that the sum of indices for V1 is equal to or lower than the sum
13656 // of indices for V2. When those are equal, try to ensure that the number of odd
13657 // indices for V1 is lower than the number of odd indices for V2.
13658 if (NumV1Elements == NumV2Elements) {
13659 int LowV1Elements = 0, LowV2Elements = 0;
13660 for (int M : Mask.slice(0, NumElements / 2))
13661 if (M >= NumElements)
13665 if (LowV2Elements > LowV1Elements)
13667 if (LowV2Elements == LowV1Elements) {
13668 int SumV1Indices = 0, SumV2Indices = 0;
13669 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13670 if (Mask[i] >= NumElements)
13672 else if (Mask[i] >= 0)
13674 if (SumV2Indices < SumV1Indices)
13676 if (SumV2Indices == SumV1Indices) {
13677 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13678 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13679 if (Mask[i] >= NumElements)
13680 NumV2OddIndices += i % 2;
13681 else if (Mask[i] >= 0)
13682 NumV1OddIndices += i % 2;
13683 if (NumV2OddIndices < NumV1OddIndices)
13692 /// \brief Top-level lowering for x86 vector shuffles.
13694 /// This handles decomposition, canonicalization, and lowering of all x86
13695 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13696 /// above in helper routines. The canonicalization attempts to widen shuffles
13697 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13698 /// s.t. only one of the two inputs needs to be tested, etc.
13699 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13700 SelectionDAG &DAG) {
13701 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13702 ArrayRef<int> Mask = SVOp->getMask();
13703 SDValue V1 = Op.getOperand(0);
13704 SDValue V2 = Op.getOperand(1);
13705 MVT VT = Op.getSimpleValueType();
13706 int NumElements = VT.getVectorNumElements();
13708 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13710 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13711 "Can't lower MMX shuffles");
13713 bool V1IsUndef = V1.isUndef();
13714 bool V2IsUndef = V2.isUndef();
13715 if (V1IsUndef && V2IsUndef)
13716 return DAG.getUNDEF(VT);
13718 // When we create a shuffle node we put the UNDEF node as the second operand,
13719 // but in some cases the first operand may be transformed to UNDEF.
13720 // In this case we should just commute the node.
13722 return DAG.getCommutedVectorShuffle(*SVOp);
13724 // Check for non-undef masks pointing at an undef vector and make the masks
13725 // undef as well. This makes it easier to match the shuffle based solely on
13726 // the mask.
13729 if (M >= NumElements) {
13730 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13731 for (int &M : NewMask)
13732 if (M >= NumElements)
13734 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13737 // Check for illegal shuffle mask element index values.
13738 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13739 assert(llvm::all_of(Mask,
13740 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13741 "Out of bounds shuffle index");
13743 // We actually see shuffles that are entirely re-arrangements of a set of
13744 // zero inputs. This mostly happens while decomposing complex shuffles into
13745 // simple ones. Directly lower these as a buildvector of zeros.
13746 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13747 if (Zeroable.isAllOnesValue())
13748 return getZeroVector(VT, Subtarget, DAG, DL);
13750 // Try to collapse shuffles into using a vector type with fewer elements but
13751 // wider element types. We cap this to not form integers or floating point
13752 // elements wider than 64 bits, but it might be interesting to form i128
13753 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
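// Example of the widening (illustrative): a v4i32 shuffle with mask
// {0, 1, 6, 7} pairs up cleanly and becomes a v2i64 shuffle with mask
// {0, 3}, provided v2i64 is a legal type for the subtarget.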
13754 SmallVector<int, 16> WidenedMask;
13755 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13756 canWidenShuffleElements(Mask, WidenedMask)) {
13757 MVT NewEltVT = VT.isFloatingPoint()
13758 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13759 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13760 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13761 // Make sure that the new vector type is legal. For example, v2f64 isn't
13762 // legal on SSE1.
13763 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13764 V1 = DAG.getBitcast(NewVT, V1);
13765 V2 = DAG.getBitcast(NewVT, V2);
13766 return DAG.getBitcast(
13767 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13771 // Commute the shuffle if it will improve canonicalization.
13772 if (canonicalizeShuffleMaskWithCommute(Mask))
13773 return DAG.getCommutedVectorShuffle(*SVOp);
13775 // For each vector width, delegate to a specialized lowering routine.
13776 if (VT.is128BitVector())
13777 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13780 if (VT.is256BitVector())
13781 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13784 if (VT.is512BitVector())
13785 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13789 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13791 llvm_unreachable("Unimplemented!");
13794 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13795 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13796 const X86Subtarget &Subtarget,
13797 SelectionDAG &DAG) {
13798 SDValue Cond = Op.getOperand(0);
13799 SDValue LHS = Op.getOperand(1);
13800 SDValue RHS = Op.getOperand(2);
13802 MVT VT = Op.getSimpleValueType();
13804 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13806 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13808 // Only non-legal VSELECTs reach this lowering; convert those into generic
13809 // shuffles and re-use the shuffle lowering path for blends.
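// Hypothetical example (not in the original): for a v4i32 VSELECT with
// constant condition <-1, 0, 0, -1>, elements 0 and 3 take the LHS and
// elements 1 and 2 take the RHS, giving the shuffle mask {0, 5, 6, 3}.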
13810 SmallVector<int, 32> Mask;
13811 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13812 SDValue CondElt = CondBV->getOperand(i);
13814 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13817 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  // A vselect where all conditions and data are constants can be optimized
  // into a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
    return Op;

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // patterns.
  if (VT.getSizeInBits() == 512) {
    SDValue Cond = Op.getOperand(0);
    // The vNi1 condition case should be handled above as it can be trivially
    // lowered.
    assert(Cond.getValueType().getScalarSizeInBits() ==
               VT.getScalarSizeInBits() &&
           "Should have a size-matched integer condition!");
    // Build a mask by testing the condition against itself (tests for zero).
    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
    // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op; if we need to expand, return a null
  // SDValue.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8: {
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();
  }

  case MVT::v8i16:
  case MVT::v16i16: {
    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return Op;

    // FIXME: We should custom lower this by fixing the condition and using i8
    // blends.
    return SDValue();
  }
  }
}

static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
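    // PEXTRB produces its result in a 32-bit GPR; the AssertZext below records
    // that the upper 24 bits are known zero, so the final truncate to i8 is
    // free.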
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to an FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         isNullConstant(Op.getOperand(1))) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getBitcast(MVT::f32, Extract);
  }

  if (VT == MVT::i32 || VT == MVT::i64) {
    // EXTRACTPS/PEXTRQ work with a constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }

  return SDValue();
}

/// Extract one bit from a mask vector, like v16i1 or v8i1 (an AVX-512
/// feature).
SDValue
X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Vec);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  MVT EltVT = Op.getSimpleValueType();

  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
         "Unexpected vector type in ExtractBitFromMaskVector");

  // A variable index can't be handled in mask registers, so
  // extend the vector to VR512/VR128.
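  // E.g. (a sketch): for extractelement <8 x i1> %m, i32 %i the mask is
  // sign-extended to v8i64, the element is extracted as an i64, and the
  // result is truncated back down to i1.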
  if (!isa<ConstantSDNode>(Idx)) {
    unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
    // than extending to 128/256-bit.
    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                              ExtVT.getVectorElementType(), Ext, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
      (VecVT.getVectorNumElements() < 8)) {
    // Use kshiftlw/rw instruction.
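    // Sketch: to read bit 3 of a v16i1 mask, shift left by 15 - 3 = 12 to
    // park it in the MSB, then shift right by 15 so it lands in bit 0 with
    // the rest of the register cleared.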
    VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
                      DAG.getUNDEF(VecVT),
                      Vec,
                      DAG.getIntPtrConstant(0, dl));
  }
  unsigned MaxShift = VecVT.getVectorNumElements() - 1;
  if (MaxShift - IdxVal)
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                    DAG.getConstant(MaxShift, dl, MVT::i8));
  return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);

  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG);

  if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get the performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // example: extractelement <16 x i8> %a, i32 %i
    //
    // The in-register sequence:
    //
    // Block Throughput: 3.00 Cycles
    // Throughput Bottleneck: Port5
    //
    // | Num Of |   Ports pressure in cycles  |    |
    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
    // ---------------------------------------------
    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
    // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
    // Total Num Of Uops: 4
    //
    // The stack-spill sequence for the same extract:
    //
    // Block Throughput: 1.00 Cycles
    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
    //
    // |    |  Ports pressure in cycles   |  |
    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
    // ---------------------------------------------------------
    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4
    return SDValue();
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
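  // E.g. (a sketch): element 5 of a v8f32 lives in the upper 128-bit half, so
  // extract the subvector starting at element 4 and then take element
  // 5 & 3 == 1 of it.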
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
    IdxVal &= ElemsPerChunk - 1;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  MVT VT = Op.getSimpleValueType();

  if (VT.getSizeInBits() == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    // Transform it so it matches pextrw, which produces a 32-bit result.
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling to
  // stack.
  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
    // Extract either the lowest i32 or any i16, and extract the sub-byte.
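    // Worked example (a sketch): for index 5 of a v16i8, the byte lives in
    // 16-bit word 5 / 2 = 2; extract that word and shift right by
    // (5 % 2) * 8 = 8 bits to reach the high byte.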
    int DWordIdx = IdxVal / 4;
    if (DWordIdx == 0) {
      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                DAG.getBitcast(MVT::v4i32, Vec),
                                DAG.getIntPtrConstant(DWordIdx, dl));
      int ShiftVal = (IdxVal % 4) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }

    int WordIdx = IdxVal / 2;
    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                              DAG.getBitcast(MVT::v8i16, Vec),
                              DAG.getIntPtrConstant(WordIdx, dl));
    int ShiftVal = (IdxVal % 2) * 8;
    if (ShiftVal != 0)
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
  }

  if (VT.getSizeInBits() == 32) {
    if (IdxVal == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}

/// Insert one bit into a mask vector, like v16i1 or v8i1 (an AVX-512
/// feature).
SDValue
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  MVT VecVT = Vec.getSimpleValueType();

  if (!isa<ConstantSDNode>(Idx)) {
    // Non-constant index: extend source and destination,
    // insert the element, and then truncate the result.
    MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
    MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
  unsigned NumElems = VecVT.getVectorNumElements();

  if (Vec.isUndef()) {
    if (IdxVal)
      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    return EltInVec;
  }

  // Insertion of one bit into the first position.
  if (IdxVal == 0) {
    // Clear the top bits of the vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    // Clear the first bit of the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
  // Insertion of one bit into the last position.
  if (IdxVal == NumElems - 1) {
    // Move the bit to the last position inside the vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(IdxVal, dl, MVT::i8));
    // Clear the last bit of the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }

  // Use a shuffle to insert the element.
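  // E.g. (a sketch): inserting at index 2 of a v8i1 builds the mask
  // <0, 1, 8, 3, 4, 5, 6, 7>, where index 8 is element 0 of EltInVec.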
  SmallVector<int, 64> MaskVec(NumElems);
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = (i == IdxVal) ? NumElems : i;

  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
}

SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();

  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);
  if (!isa<ConstantSDNode>(N2))
    return SDValue();
  auto *N2C = cast<ConstantSDNode>(N2);
  unsigned IdxVal = N2C->getZExtValue();

  bool IsZeroElt = X86::isZeroNode(N1);
  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

  // If we are inserting an element, see if we can do this more efficiently
  // with a blend shuffle against a rematerializable vector than with a costly
  // integer insertion.
  // TODO: pre-SSE41 targets will tend to use bit masking - this could still
  // be beneficial if we are inserting several zeros and can combine the masks.
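  // E.g. (a sketch): inserting 0 into lane 1 of a v4i32 %v becomes
  // shufflevector %v, zeroinitializer, <0, 5, 2, 3>, which matches a blend.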
  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                  : DAG.getConstant(-1, dl, VT);
    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
      }
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use a mask instead of a modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getConstant(IdxIn128, dl, MVT::i32));

    // Insert the changed part back into the bigger vector.
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
  // argument. SSE41 is required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }

    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      // zero here. The DAG Combiner may combine an extract_elt index into
      // these bits. For example (insert (extract, 3), 2) could be matched by
      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      // value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      // combine either bitwise AND or insert of float 0.0 to set these bits.
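      // E.g. (a sketch): inserting into lane 2 encodes the immediate as
      // IdxVal << 4 = 0x20 - source select 0, destination select 2, and an
      // empty zero mask.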

      bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand.
        N2 = DAG.getIntPtrConstant(1, dl);
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      }
      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar to vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }

    // PINSR* works with a constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }
  return SDValue();
}

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // Replacing a xor+movd with xorps is always cheaper and simplifies further
  // combines.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && "Expected an SSE type!");

  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");

  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT ResVT = Op.getSimpleValueType();

  assert((In.getSimpleValueType().is256BitVector() ||
          In.getSimpleValueType().is512BitVector()) &&
         "Can only extract from 256-bit or 512-bit vectors");

  // If the input is a buildvector just emit a smaller one.
  unsigned ElemsPerChunk = ResVT.getVectorNumElements();
  if (In.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));

  // Everything else is legal.
  return Op;
}

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}

// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
  // References to absolute symbols are never PC-relative.
  if (GV && GV->isAbsoluteSymbolRef())
    return X86ISD::Wrapper;

  CodeModel::Model M = getTargetMachine().getCodeModel();
  if (Subtarget.isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    return X86ISD::WrapperRIP;

  return X86ISD::Wrapper;
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetConstantPool(
      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

  SDLoc DL(Op);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isPositionIndependent() && !Subtarget.is64Bit()) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
  unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
                                              const SDLoc &dl, int64_t Offset,
                                              SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
  CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
  }

  Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);
  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
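// Roughly (a sketch of typical codegen, for illustration only):
//   leal x@tlsgd(,%ebx,1), %eax
//   calll ___tls_get_addr@PLT
// with the variable's address returned in %eax.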
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
                                   .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or   "addl x@indntpoff,%eax" (initial exec)
  // or   "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().Options.EmulatedTLS)
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
    case TLSModel::GeneralDynamic:
      if (Subtarget.is64Bit())
        return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
    case TLSModel::LocalDynamic:
      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
                                         Subtarget.is64Bit());
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                 PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);

    // Lowering the machine ISD will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                               DAG.getIntPtrConstant(0, DL, true),
                               Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium() ||
      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
    //   mov     rcx, qword [rdx+rcx*8]
    //   mov     eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable.
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of the start of the .tls section.
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getSimpleValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes haven't. Insert an AND to be safe; it's optimized away
  // during isel.
  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, dl, MVT::i8))
                       : DAG.getConstant(0, dl, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  }

  // If the shift amount is larger or equal than the width of a part we can't
  // rely on the results of shld/shrd. Insert a test and select the appropriate
  // values for large shift amounts.
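  // E.g. (a sketch): for a 64-bit SHL_PARTS by 40 on a 32-bit target, the test
  // below sees (40 & 32) != 0, so Hi = Lo << (40 & 31) = Lo << 8 and the low
  // part becomes 0.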
  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, dl, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, dl, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                     DAG.getUNDEF(SrcVT)));
    }
    if (SrcVT.getVectorElementType() == MVT::i1) {
      if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
        return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                           DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
    }
    return SDValue();
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget.is64Bit()) {
    return Op;
  }

  SDValue ValueToStore = Op.getOperand(0);
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);

  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue Chain = DAG.getStore(
      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD.
  SDLoc DL(Op);
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                                    X86ISD::FILD, DL,
                                           Tys, Ops, SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is glued to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When the stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueSizeInBits()/8;
    int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
    auto PtrVT = getPointerTy(MF.getDataLayout());
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, Op.getValueType(), MMO);
    Result = DAG.getLoad(
        Op.getValueType(), DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  }

  return Result;
}

/// 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */
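
  // Why this works (a sketch): punpckldq builds two doubles whose bit patterns
  // are (0x43300000 << 32) | lo = 0x1.0p52 + lo and
  // (0x45300000 << 32) | hi = 0x1.0p84 + hi * 0x1.0p32. Subtracting the magic
  // constants leaves exactly lo and hi * 2^32, whose sum is the input value.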

  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

  SmallVector<Constant*,2> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 =
      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue Unpck1 =
      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
  SDValue CLod1 =
      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  SDValue Result;
  if (Subtarget.hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0, dl));
}

/// 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getBitcast(MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0, dl));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(
      ISD::OR, dl, MVT::v2i64,
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  Or =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

  // Subtract the bias.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  MVT DestVT = Op.getSimpleValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0, dl));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  // Sub is already the right width; no rounding needed.
  return Sub;
}

static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  SDValue N0 = Op.getOperand(0);
  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  // Legalize to v4i32 type.
  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                   DAG.getUNDEF(MVT::v2i32));

  if (Subtarget.hasAVX512())
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);

  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
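  // In other words (a sketch): each 32-bit lane v is split as
  // v = (v >> 16) * 2^16 + (v & 0xffff); both halves fit in a signed i32, so
  // two signed conversions, a multiply by 65536.0, and an add reassemble the
  // unsigned value exactly.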
  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

  // Two to the power of half-word-size.
  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

  // Clear the upper part of LO, and the lower part of HI.
  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);

  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
  fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);

  // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}

static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;
  //
  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
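  //
  // The magic constants (a sketch of why they work): 0x4b000000 is 2^23 as a
  // float, so lo reinterprets as 2^23 + (v & 0xffff); 0x53000000 is 2^39, so
  // hi reinterprets as 2^39 + (v >> 16) * 2^16. Subtracting (2^39 + 2^23)
  // from hi and then adding lo leaves exactly (float) v.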
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
  MVT VecIntVT = V.getSimpleValueType();
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something else than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                             (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
  // return (float4) lo + fhi;
  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}

SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  MVT SrcVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.getVectorElementType() == MVT::i1) {
    if (SrcVT == MVT::v2i1)
      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
    MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
  }

  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v8i8:
  case MVT::v8i16: {
    MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
  }
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  case MVT::v16i8:
  case MVT::v16i16:
    assert(Subtarget.hasAVX512());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
  }
}
15305 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15306 SelectionDAG &DAG) const {
15307 SDValue N0 = Op.getOperand(0);
15309 auto PtrVT = getPointerTy(DAG.getDataLayout());
15311 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
15312 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15313 // the optimization here.
15314 if (DAG.SignBitIsZero(N0))
15315 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15317 if (Op.getSimpleValueType().isVector())
15318 return lowerUINT_TO_FP_vec(Op, DAG);
15320 MVT SrcVT = N0.getSimpleValueType();
15321 MVT DstVT = Op.getSimpleValueType();
15323 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15324 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15325 // Conversions from unsigned i32 to f32/f64 are legal,
15326 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
15330 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15331 return LowerUINT_TO_FP_i64(Op, DAG);
15332 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15333 return LowerUINT_TO_FP_i32(Op, DAG);
15334 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15337 // Make a 64-bit buffer, and use it to build an FILD.
15338 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15339 if (SrcVT == MVT::i32) {
15340 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15341 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15342 StackSlot, MachinePointerInfo());
15343 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15344 OffsetSlot, MachinePointerInfo());
15345 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15349 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15350 SDValue ValueToStore = Op.getOperand(0);
15351 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15352 // Bitcasting to f64 here allows us to do a single 64-bit store from
15353 // an SSE register, avoiding the store forwarding penalty that would come
15354 // with two 32-bit stores.
15355 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15356 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15357 MachinePointerInfo());
15358 // For i64 source, we need to add the appropriate power of 2 if the input
15359 // was negative. This is the same as the optimization in
15360 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15361 // we must be careful to do the computation in x87 extended precision, not
15362 // in SSE. (The generic code can't know it's OK to do this, or how to.)
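// Worked example (illustrative): 0x5F800000 is the IEEE-754 single 2^64
// (sign 0, biased exponent 0xBF = 191, so 2^(191-127); mantissa 0). For the
// input 0xFFFFFFFFFFFFFFFF, FILD reads the i64 as -1.0; the sign-set path
// then adds 2^64, yielding 2^64 - 1 (~1.8446744e19), the correct unsigned
// value.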
15363 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15364 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15365 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15366 MachineMemOperand::MOLoad, 8, 8);
15368 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15369 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15370 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15371                                        MVT::i64, MMO);
15373 APInt FF(32, 0x5F800000ULL);
15375 // Check whether the sign bit is set.
15376 SDValue SignSet = DAG.getSetCC(
15377 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15378 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15380 // Build a 64-bit pair (0, FF) in the constant pool, with FF in the low bits.
15381 SDValue FudgePtr = DAG.getConstantPool(
15382 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15384 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15385 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15386 SDValue Four = DAG.getIntPtrConstant(4, dl);
15387 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15388 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15390 // Load the value out, extending it from f32 to f80.
15391 // FIXME: Avoid the extend by constructing the right constant pool?
15392 SDValue Fudge = DAG.getExtLoad(
15393 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15394 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15395 /* Alignment = */ 4);
15396 // Extend everything to 80 bits to force it to be done on x87.
15397 // TODO: Are there any fast-math-flags to propagate here?
15398 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15399 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15400 DAG.getIntPtrConstant(0, dl));
15403 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15404 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15405 // just return an <SDValue(), SDValue()> pair.
15406 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15407 // to i16, i32 or i64, and we lower it to a legal sequence.
15408 // If lowered to the final integer result we return a <result, SDValue()> pair.
15409 // Otherwise we lower it to a sequence ending with a FIST, return a
15410 // <FIST, StackSlot> pair, and the caller is responsible for loading
15411 // the final integer result from StackSlot.
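// For example (illustrative): f64 -> i32 with SSE2 is already Legal
// (CVTTSD2SI), so <SDValue(), SDValue()> is returned; f80 -> i64 has no
// direct instruction, so the FIST path returns a <FIST, StackSlot> pair for
// the caller to load from.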
15412 std::pair<SDValue,SDValue>
15413 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15414 bool IsSigned, bool IsReplace) const {
15416 SDLoc DL(Op);
15417 EVT DstTy = Op.getValueType();
15418 EVT TheVT = Op.getOperand(0).getValueType();
15419 auto PtrVT = getPointerTy(DAG.getDataLayout());
15421 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15422 // f16 must be promoted before using the lowering in this routine.
15423 // fp128 does not use this lowering.
15424 return std::make_pair(SDValue(), SDValue());
15427 // If using FIST to compute an unsigned i64, we'll need some fixup
15428 // to handle values above the maximum signed i64. A FIST is always
15429 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15430 bool UnsignedFixup = !IsSigned &&
15431 DstTy == MVT::i64 &&
15432 (!Subtarget.is64Bit() ||
15433 !isScalarFPTypeInSSEReg(TheVT));
15435 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15436 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15437 // The low 32 bits of the fist result will have the correct uint32 result.
15438 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15439 DstTy = MVT::i64;
15440 }
15442 assert(DstTy.getSimpleVT() <= MVT::i64 &&
15443 DstTy.getSimpleVT() >= MVT::i16 &&
15444 "Unknown FP_TO_INT to lower!");
15446 // These are really Legal.
15447 if (DstTy == MVT::i32 &&
15448 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15449 return std::make_pair(SDValue(), SDValue());
15450 if (Subtarget.is64Bit() &&
15451 DstTy == MVT::i64 &&
15452 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15453 return std::make_pair(SDValue(), SDValue());
15455 // We lower FP->int64 into FISTP64 followed by a load from a temporary
15456 // stack slot.
15457 MachineFunction &MF = DAG.getMachineFunction();
15458 unsigned MemSize = DstTy.getSizeInBits()/8;
15459 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15460 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15462 unsigned Opc;
15463 switch (DstTy.getSimpleVT().SimpleTy) {
15464 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15465 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15466 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15467 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15470 SDValue Chain = DAG.getEntryNode();
15471 SDValue Value = Op.getOperand(0);
15472 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15474 if (UnsignedFixup) {
15476 // Conversion to unsigned i64 is implemented with a select,
15477 // depending on whether the source value fits in the range
15478 // of a signed i64. Let Thresh be the FP equivalent of
15479 // 0x8000000000000000ULL.
15481 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15482 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15483 // Fist-to-mem64 FistSrc
15484 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15485 // to XOR'ing the high 32 bits with Adjust.
15487 // Being a power of 2, Thresh is exactly representable in all FP formats.
15488 // For X87 we'd like to use the smallest FP type for this constant, but
15489 // for DAG type consistency we have to match the FP operand type.
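// Worked example (illustrative): Thresh = 2^63 (0x5F000000 as a float,
// biased exponent 0xBE = 190). For the input 2^63 + 42, Value >= Thresh, so
// FistSrc = Value - Thresh = 42 and Adjust = 0x80000000; the FIST stores 42,
// and XOR'ing the high dword with Adjust restores bit 63, giving back
// 2^63 + 42 exactly.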
15491 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15492 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15493 bool LosesInfo = false;
15494 if (TheVT == MVT::f64)
15495 // The rounding mode is irrelevant as the conversion should be exact.
15496 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15497                         &LosesInfo);
15498 else if (TheVT == MVT::f80)
15499 Status = Thresh.convert(APFloat::x87DoubleExtended(),
15500 APFloat::rmNearestTiesToEven, &LosesInfo);
15502 assert(Status == APFloat::opOK && !LosesInfo &&
15503 "FP conversion should have been exact");
15505 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15507 SDValue Cmp = DAG.getSetCC(DL,
15508 getSetCCResultType(DAG.getDataLayout(),
15509 *DAG.getContext(), TheVT),
15510 Value, ThreshVal, ISD::SETLT);
15511 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15512 DAG.getConstant(0, DL, MVT::i32),
15513 DAG.getConstant(0x80000000, DL, MVT::i32));
15514 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15515 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15516 *DAG.getContext(), TheVT),
15517 Value, ThreshVal, ISD::SETLT);
15518 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15521 // FIXME: This causes a redundant load/store if the SSE-class value is already
15522 // in memory, such as if it is on the call stack.
15523 if (isScalarFPTypeInSSEReg(TheVT)) {
15524 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15525 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15526 MachinePointerInfo::getFixedStack(MF, SSFI));
15527 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15528 SDValue Ops[] = {
15529   Chain, StackSlot, DAG.getValueType(TheVT)
15530 };
15532 MachineMemOperand *MMO =
15533 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15534 MachineMemOperand::MOLoad, MemSize, MemSize);
15535 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15536 Chain = Value.getValue(1);
15537 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15538 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15541 MachineMemOperand *MMO =
15542 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15543 MachineMemOperand::MOStore, MemSize, MemSize);
15545 if (UnsignedFixup) {
15547 // Insert the FIST, load its result as two i32's,
15548 // and XOR the high i32 with Adjust.
15550 SDValue FistOps[] = { Chain, Value, StackSlot };
15551 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15552 FistOps, DstTy, MMO);
15554 SDValue Low32 =
15555     DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15556 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15558 SDValue High32 =
15559     DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15560 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15562 if (Subtarget.is64Bit()) {
15563 // Join High32 and Low32 into a 64-bit result.
15564 // (High32 << 32) | Low32
15565 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15566 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15567 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15568 DAG.getConstant(32, DL, MVT::i8));
15569 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15570 return std::make_pair(Result, SDValue());
15573 SDValue ResultOps[] = { Low32, High32 };
15575 SDValue pair = IsReplace
15576 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15577 : DAG.getMergeValues(ResultOps, DL);
15578 return std::make_pair(pair, SDValue());
15580 // Build the FP_TO_INT*_IN_MEM
15581 SDValue Ops[] = { Chain, Value, StackSlot };
15582 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15583                                        Ops, DstTy, MMO);
15584 return std::make_pair(FIST, StackSlot);
15588 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15589 const X86Subtarget &Subtarget) {
15590 MVT VT = Op->getSimpleValueType(0);
15591 SDValue In = Op->getOperand(0);
15592 MVT InVT = In.getSimpleValueType();
15594 SDLoc dl(Op);
15595 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15596 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15598 // Optimize vectors in AVX mode:
15599 //
15600 //   v8i16 -> v8i32
15601 //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
15602 //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
15603 //   Concat upper and lower parts.
15604 //
15605 //   v4i32 -> v4i64
15606 //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
15607 //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
15608 //   Concat upper and lower parts.
15609 //
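// Worked example (illustrative, little-endian lane order): for a v4i32
// zero-extend, vpunpckldq with a zero vector interleaves to (x0, 0, x1, 0);
// reinterpreted as v2i64 this is exactly (zext x0, zext x1), and the high
// half follows from vpunpckhdq.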
15611 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15612     ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15613     ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15614   return SDValue();
15616 if (Subtarget.hasInt256())
15617 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15619 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15620 SDValue Undef = DAG.getUNDEF(InVT);
15621 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15622 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15623 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15625 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15626 VT.getVectorNumElements()/2);
15628 OpLo = DAG.getBitcast(HVT, OpLo);
15629 OpHi = DAG.getBitcast(HVT, OpHi);
15631 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15634 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15635 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15636 MVT VT = Op->getSimpleValueType(0);
15637 SDValue In = Op->getOperand(0);
15638 MVT InVT = In.getSimpleValueType();
15640 unsigned NumElts = VT.getVectorNumElements();
15642 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15643 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15644 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15646 if (InVT.getVectorElementType() != MVT::i1)
15647   return SDValue();
15649 // Extend VT if the target is a 256- or 128-bit vector and VLX is not supported.
15650 MVT ExtVT = VT;
15651 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15652   ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15654 SDValue One =
15655  DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15656 SDValue Zero =
15657  DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15659 SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
15660 if (VT == ExtVT)
15661   return SelectedVal;
15662 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15665 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15666 SelectionDAG &DAG) {
15667 if (Subtarget.hasFp256())
15668 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15669   return Res;
15671 return SDValue();
15672 }
15674 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15675 SelectionDAG &DAG) {
15677 MVT VT = Op.getSimpleValueType();
15678 SDValue In = Op.getOperand(0);
15679 MVT SVT = In.getSimpleValueType();
15681 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15682 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15684 if (Subtarget.hasFp256())
15685 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15686   return Res;
15688 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15689        VT.getVectorNumElements() != SVT.getVectorNumElements());
15690 return SDValue();
15691 }
15693 /// Helper to recursively truncate vector elements in half with PACKSS.
15694 /// It makes use of the fact that vector comparison results will be all-zeros
15695 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15696 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15697 /// within each 128-bit lane.
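/// Worked example (illustrative): a v8i16 comparison result holds only
/// 0x0000 or 0xFFFF per element; PACKSSWB saturates those to 0x00 or 0xFF,
/// so packing two such v8i16 halves yields the equivalent v16i8 mask with
/// no information loss.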
15698 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15699                                                const SDLoc &DL,
15700                                                SelectionDAG &DAG,
15701                                                const X86Subtarget &Subtarget) {
15702 // Requires SSE2 but AVX512 has fast truncate.
15703 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15704   return SDValue();
15706 EVT SrcVT = In.getValueType();
15708 // No truncation required, we might get here due to recursive calls.
15709 if (SrcVT == DstVT)
15710   return In;
15712 // We only support vector truncation to 128 bits or wider from a
15713 // source that is 256 bits or wider.
15714 if ((DstVT.getSizeInBits() % 128) != 0)
15715   return SDValue();
15716 if ((SrcVT.getSizeInBits() % 256) != 0)
15717   return SDValue();
15719 unsigned NumElems = SrcVT.getVectorNumElements();
15720 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15721 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15723 EVT PackedSVT =
15724     EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15726 // Extract lower/upper subvectors.
15727 unsigned NumSubElts = NumElems / 2;
15728 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15729 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15730 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15732 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15733 if (SrcVT.is256BitVector()) {
15734 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15735 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15736 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15737 return DAG.getBitcast(DstVT, Res);
15740 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15741 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15742 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15743 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15744 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15745 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15747 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15748 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15749 Res = DAG.getBitcast(MVT::v4i64, Res);
15750 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15752 if (DstVT.is256BitVector())
15753 return DAG.getBitcast(DstVT, Res);
15755 // If this is a 512-bit -> 128-bit truncate, pack again for another stage.
15756 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15757 Res = DAG.getBitcast(PackedVT, Res);
15758 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15761 // Recursively pack lower/upper subvectors, concat result and pack again.
15762 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15763 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15764 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15765 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15767 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15768 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15769 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15772 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15773 const X86Subtarget &Subtarget) {
15774 SDLoc DL(Op);
15776 MVT VT = Op.getSimpleValueType();
15777 SDValue In = Op.getOperand(0);
15778 MVT InVT = In.getSimpleValueType();
15780 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15782 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
15783 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15784 if (InVT.getScalarSizeInBits() <= 16) {
15785 if (Subtarget.hasBWI()) {
15786 // Legal; this will select to VPMOVB2M or VPMOVW2M.
15787 // Shifting packed bytes is not supported natively, so bitcast to words.
15788 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15789 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15790 DAG.getBitcast(ExtVT, In),
15791 DAG.getConstant(ShiftInx, DL, ExtVT));
15792 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15793 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15794 }
15795 // Use TESTD/Q, extended vector to packed dword/qword.
15796 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15797 "Unexpected vector type.");
15798 unsigned NumElts = InVT.getVectorNumElements();
15799 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15800 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15801 InVT = ExtVT;
15802 ShiftInx = InVT.getScalarSizeInBits() - 1;
15803 }
15805 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15806 DAG.getConstant(ShiftInx, DL, InVT));
15807 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
15810 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15811 SDLoc DL(Op);
15812 MVT VT = Op.getSimpleValueType();
15813 SDValue In = Op.getOperand(0);
15814 MVT InVT = In.getSimpleValueType();
15816 if (VT == MVT::i1) {
15817 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15818 "Invalid scalar TRUNCATE operation");
15819 if (InVT.getSizeInBits() >= 32)
15820   return SDValue();
15821 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15822 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15823 }
15824 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15825 "Invalid TRUNCATE operation");
15827 if (VT.getVectorElementType() == MVT::i1)
15828 return LowerTruncateVecI1(Op, DAG, Subtarget);
15830 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15831 if (Subtarget.hasAVX512()) {
15832 // word to byte only under BWI
15833 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15834 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15835 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
15836 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15839 // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
15840 if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
15841 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15842   return V;
15844 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15845 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
15846 if (Subtarget.hasInt256()) {
15847 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15848 In = DAG.getBitcast(MVT::v8i32, In);
15849 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
15850 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15851 DAG.getIntPtrConstant(0, DL));
15854 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15855 DAG.getIntPtrConstant(0, DL));
15856 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15857 DAG.getIntPtrConstant(2, DL));
15858 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15859 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15860 static const int ShufMask[] = {0, 2, 4, 6};
15861 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15864 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15865 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
15866 if (Subtarget.hasInt256()) {
15867 In = DAG.getBitcast(MVT::v32i8, In);
15869 // The PSHUFB mask:
15870 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
15871 -1, -1, -1, -1, -1, -1, -1, -1,
15872 16, 17, 20, 21, 24, 25, 28, 29,
15873 -1, -1, -1, -1, -1, -1, -1, -1 };
15874 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
15875 In = DAG.getBitcast(MVT::v4i64, In);
15877 static const int ShufMask2[] = {0, 2, -1, -1};
15878 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
15879 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15880 DAG.getIntPtrConstant(0, DL));
15881 return DAG.getBitcast(VT, In);
15884 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15885 DAG.getIntPtrConstant(0, DL));
15887 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15888 DAG.getIntPtrConstant(4, DL));
15890 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15891 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15893 // The PSHUFB mask:
15894 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15895 -1, -1, -1, -1, -1, -1, -1, -1};
15897 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
15898 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
15900 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15901 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15903 // The MOVLHPS Mask:
15904 static const int ShufMask2[] = {0, 1, 4, 5};
15905 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15906 return DAG.getBitcast(MVT::v8i16, res);
15909 // Handle truncation of V256 to V128 using shuffles.
15910 if (!VT.is128BitVector() || !InVT.is256BitVector())
15911   return SDValue();
15913 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15915 unsigned NumElems = VT.getVectorNumElements();
15916 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15918 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15919 // Prepare truncation shuffle mask
15920 for (unsigned i = 0; i != NumElems; ++i)
15921 MaskVec[i] = i * 2;
15922 In = DAG.getBitcast(NVT, In);
15923 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
15924 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15925 DAG.getIntPtrConstant(0, DL));
15928 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
15929 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15930 MVT VT = Op.getSimpleValueType();
15932 if (VT.isVector()) {
15933 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15934 SDValue Src = Op.getOperand(0);
15935 SDLoc dl(Op);
15936 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15937 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
15938 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15939 DAG.getUNDEF(MVT::v2f32)));
15940 }
15942 return SDValue();
15943 }
15945 assert(!VT.isVector());
15947 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15948 IsSigned, /*IsReplace=*/ false);
15949 SDValue FIST = Vals.first, StackSlot = Vals.second;
15950 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
15951 if (!FIST.getNode())
15952   return Op;
15954 if (StackSlot.getNode())
15955 // Load the result.
15956 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
15958 // The node is the result.
15959 return FIST;
15960 }
15962 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
15963 SDLoc DL(Op);
15964 MVT VT = Op.getSimpleValueType();
15965 SDValue In = Op.getOperand(0);
15966 MVT SVT = In.getSimpleValueType();
15968 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
15970 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
15971 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
15972 In, DAG.getUNDEF(SVT)));
15973 }
15975 /// The only differences between FABS and FNEG are the mask and the logic op.
15976 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
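/// For example (illustrative, f32 case): FABS lowers to FAND with the
/// constant 0x7FFFFFFF, FNEG to FXOR with 0x80000000, and the fused
/// FNEG(FABS(x)) (FNABS) to FOR with 0x80000000.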
15977 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
15978 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
15979 "Wrong opcode for lowering FABS or FNEG.");
15981 bool IsFABS = (Op.getOpcode() == ISD::FABS);
15983 // If this is a FABS and it has an FNEG user, bail out to fold the combination
15984 // into an FNABS. We'll lower the FABS after that if it is still in use.
15985 if (IsFABS)
15986   for (SDNode *User : Op->uses())
15987     if (User->getOpcode() == ISD::FNEG)
15988       return Op;
15991 MVT VT = Op.getSimpleValueType();
15992 SDLoc dl(Op);
15993 bool IsF128 = (VT == MVT::f128);
15995 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
15996 // decide if we should generate a 16-byte constant mask when we only need 4 or
15997 // 8 bytes for the scalar case.
15999 MVT LogicVT;
16000 MVT EltVT;
16002 if (VT.isVector()) {
16003   LogicVT = VT;
16004   EltVT = VT.getVectorElementType();
16005 } else if (IsF128) {
16006   // SSE instructions are used for optimized f128 logical operations.
16007   LogicVT = MVT::f128;
16008   EltVT = MVT::f32;
16009 } else {
16010   // There are no scalar bitwise logical SSE/AVX instructions, so we
16011   // generate a 16-byte vector constant and logic op even for the scalar case.
16012   // Using a 16-byte mask allows folding the load of the mask with
16013   // the logic op, so it can save ~4 bytes of code size.
16014   LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16015   EltVT = VT;
16016 }
16018 unsigned EltBits = EltVT.getSizeInBits();
16019 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16020 APInt MaskElt =
16021     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16022 const fltSemantics &Sem =
16023 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16024 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16025 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16027 SDValue Op0 = Op.getOperand(0);
16028 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16029 unsigned LogicOp =
16030     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16031 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16033 if (VT.isVector() || IsF128)
16034 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16036 // For the scalar case extend to a 128-bit vector, perform the logic op,
16037 // and extract the scalar result back out.
16038 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16039 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16040 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16041 DAG.getIntPtrConstant(0, dl));
16044 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16045 SDValue Mag = Op.getOperand(0);
16046 SDValue Sign = Op.getOperand(1);
16049 // If the sign operand is smaller, extend it first.
16050 MVT VT = Op.getSimpleValueType();
16051 if (Sign.getSimpleValueType().bitsLT(VT))
16052 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16054 // And if it is bigger, shrink it first.
16055 if (Sign.getSimpleValueType().bitsGT(VT))
16056 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16058 // At this point the operands and the result should have the same
16059 // type, and that won't be f80 since that is not custom lowered.
16060 bool IsF128 = (VT == MVT::f128);
16061 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16062 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16063 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16064 "Unexpected type in LowerFCOPYSIGN");
16066 MVT EltVT = VT.getScalarType();
16067 const fltSemantics &Sem =
16068 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16069 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16071 // Perform all scalar logic operations as 16-byte vectors because there are no
16072 // scalar FP logic instructions in SSE.
16073 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16074 // unnecessary splats, but we might miss load folding opportunities. Should
16075 // this decision be based on OptimizeForSize?
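// The lowering below computes, in bit terms (illustrative):
//   result = (Mag & ~SignMask) | (Sign & SignMask)
// e.g. for f32, copysign(1.0f, -2.0f): (0x3F800000 & 0x7FFFFFFF) |
// (0xC0000000 & 0x80000000) = 0xBF800000 = -1.0f.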
16076 bool IsFakeVector = !VT.isVector() && !IsF128;
16077 MVT LogicVT = VT;
16078 if (IsFakeVector)
16079   LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16081 // The mask constants are automatically splatted for vector types.
16082 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16083 SDValue SignMask = DAG.getConstantFP(
16084 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16085 SDValue MagMask = DAG.getConstantFP(
16086 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16088 // First, clear all bits but the sign bit from the second operand (sign).
16089 if (IsFakeVector)
16090   Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16091 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16093 // Next, clear the sign bit from the first operand (magnitude).
16094 // TODO: If we had general constant folding for FP logic ops, this check
16095 // wouldn't be necessary.
16096 SDValue MagBits;
16097 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16098   APFloat APF = Op0CN->getValueAPF();
16099   APF.clearSign();
16100   MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16101 } else {
16102   // If the magnitude operand wasn't a constant, we need to AND out the sign.
16103   if (IsFakeVector)
16104     Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16105   MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16106 }
16108 // OR the magnitude value with the sign bit.
16109 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16110 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16111 DAG.getIntPtrConstant(0, dl));
16114 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16115 SDValue N0 = Op.getOperand(0);
16116 SDLoc dl(Op);
16117 MVT VT = Op.getSimpleValueType();
16119 MVT OpVT = N0.getSimpleValueType();
16120 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16121 "Unexpected type for FGETSIGN");
16123 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16124 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16125 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16126 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16127 Res = DAG.getZExtOrTrunc(Res, dl, VT);
16128 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16129 return Res;
16130 }
16132 // Check whether an OR'd tree is PTEST-able.
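// PTEST X, X sets ZF iff X is all zeros, so (sketch, illustrative):
//   (or (extract v, 0), (extract v, 1), ...) == 0
// can instead test the source vector(s) directly as (ptest v, v) and use
// the resulting ZF.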
16133 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16134 SelectionDAG &DAG) {
16135 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16137 if (!Subtarget.hasSSE41())
16138   return SDValue();
16140 if (!Op->hasOneUse())
16141   return SDValue();
16143 SDNode *N = Op.getNode();
16144 SDLoc DL(N);
16146 SmallVector<SDValue, 8> Opnds;
16147 DenseMap<SDValue, unsigned> VecInMap;
16148 SmallVector<SDValue, 8> VecIns;
16149 EVT VT = MVT::Other;
16151 // Recognize a special case where a vector is casted into wide integer to
16152 // test all 0s.
16153 Opnds.push_back(N->getOperand(0));
16154 Opnds.push_back(N->getOperand(1));
16156 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16157 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16158 // BFS traverse all OR'd operands.
16159 if (I->getOpcode() == ISD::OR) {
16160 Opnds.push_back(I->getOperand(0));
16161 Opnds.push_back(I->getOperand(1));
16162 // Re-evaluate the number of nodes to be traversed.
16163 e += 2; // 2 more nodes (LHS and RHS) are pushed.
16164 continue;
16165 }
16167 // Quit if not an EXTRACT_VECTOR_ELT.
16168 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16169   return SDValue();
16171 // Quit if without a constant index.
16172 SDValue Idx = I->getOperand(1);
16173 if (!isa<ConstantSDNode>(Idx))
16174   return SDValue();
16176 SDValue ExtractedFromVec = I->getOperand(0);
16177 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16178 if (M == VecInMap.end()) {
16179 VT = ExtractedFromVec.getValueType();
16180 // Quit if not 128/256-bit vector.
16181 if (!VT.is128BitVector() && !VT.is256BitVector())
16182   return SDValue();
16183 // Quit if not the same type.
16184 if (VecInMap.begin() != VecInMap.end() &&
16185 VT != VecInMap.begin()->first.getValueType())
16186   return SDValue();
16187 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16188 VecIns.push_back(ExtractedFromVec);
16190 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16191 }
16193 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16194 "Not extracted from 128-/256-bit vector.");
16196 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16198 for (DenseMap<SDValue, unsigned>::const_iterator
16199 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16200 // Quit if not all elements are used.
16201 if (I->second != FullMask)
16202   return SDValue();
16203 }
16205 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16207 // Cast all vectors into TestVT for PTEST.
16208 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16209 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16211 // If more than one full vector is evaluated, OR them first before PTEST.
16212 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16213 // Each iteration will OR 2 nodes and append the result until there is only
16214 // 1 node left, i.e. the final OR'd value of all vectors.
16215 SDValue LHS = VecIns[Slot];
16216 SDValue RHS = VecIns[Slot + 1];
16217 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16218 }
16220 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16223 /// \brief return true if \c Op has a use that doesn't just read flags.
16224 static bool hasNonFlagsUse(SDValue Op) {
16225 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16227 SDNode *User = *UI;
16228 unsigned UOpNo = UI.getOperandNo();
16229 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16230 // Look past the truncate.
16231 UOpNo = User->use_begin().getOperandNo();
16232 User = *User->use_begin();
16235 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16236 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16237   return true;
16238 }
16240 return false;
16241 }
16242 // Emit KTEST instruction for bit vectors on AVX-512
16243 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16244 const X86Subtarget &Subtarget) {
16245 if (Op.getOpcode() == ISD::BITCAST) {
16246 auto hasKTEST = [&](MVT VT) {
16247 unsigned SizeInBits = VT.getSizeInBits();
16248 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16249 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16250 };
16251 SDValue Op0 = Op.getOperand(0);
16252 MVT Op0VT = Op0.getValueType().getSimpleVT();
16253 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16254     hasKTEST(Op0VT))
16255   return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16256 }
16257 return SDValue();
16258 }
16260 /// Emit nodes that will be selected as "test Op0,Op0", or something
16261 /// equivalent.
16262 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16263 SelectionDAG &DAG) const {
16264 if (Op.getValueType() == MVT::i1) {
16265 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16266 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16267 DAG.getConstant(0, dl, MVT::i8));
16269 // CF and OF aren't always set the way we want. Determine which
16270 // of these we need.
16271 bool NeedCF = false;
16272 bool NeedOF = false;
16273 switch (X86CC) {
16274 default: break;
16275 case X86::COND_A: case X86::COND_AE:
16276 case X86::COND_B: case X86::COND_BE:
16277   NeedCF = true;
16278   break;
16279 case X86::COND_G: case X86::COND_GE:
16280 case X86::COND_L: case X86::COND_LE:
16281 case X86::COND_O: case X86::COND_NO: {
16282 // Check if we really need to set the Overflow flag.
16283 // If the NoSignedWrap flag is present, it is not actually needed.
16285 switch (Op->getOpcode()) {
16286 case ISD::ADD:
16287 case ISD::SUB:
16288 case ISD::MUL:
16289 case ISD::SHL:
16290   if (Op.getNode()->getFlags().hasNoSignedWrap())
16291     break;
16292   LLVM_FALLTHROUGH;
16293 default:
16294   NeedOF = true;
16295   break;
16296 }
16297 break;
16298 }
16299 }
16300 // See if we can use the EFLAGS value from the operand instead of
16301 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16302 // we prove that the arithmetic won't overflow, we can't use OF or CF.
16303 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16304 // Emit KTEST for bit vectors
16305 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16306   return Node;
16307 // Emit a CMP with 0, which is the TEST pattern.
16308 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16309 DAG.getConstant(0, dl, Op.getValueType()));
16311 unsigned Opcode = 0;
16312 unsigned NumOperands = 0;
16314 // Truncate operations may prevent the merge of the SETCC instruction
16315 // and the arithmetic instruction before it. Attempt to truncate the operands
16316 // of the arithmetic instruction and use a reduced bit-width instruction.
16317 bool NeedTruncation = false;
16318 SDValue ArithOp = Op;
16319 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16320 SDValue Arith = Op->getOperand(0);
16321 // Both the trunc and the arithmetic op need to have one user each.
16322 if (Arith->hasOneUse())
16323 switch (Arith.getOpcode()) {
16324 default: break;
16325 case ISD::ADD:
16326 case ISD::SUB:
16327 case ISD::AND:
16328 case ISD::OR:
16329 case ISD::XOR:
16330   NeedTruncation = true;
16331   ArithOp = Arith;
16332   break;
16333 }
16334 }
16336 // Sometimes flags can be set either with an AND or with an SRL/SHL
16337 // instruction. SRL/SHL variant should be preferred for masks longer than this
16338 // number of bits.
16339 const int ShiftToAndMaxMaskWidth = 32;
16340 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
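// Example (illustrative): (srl x, 28) == 0 can become
// (and x, 0xF0000000) == 0, selectable as TEST $0xF0000000, since that mask
// fits a signed 32-bit immediate; conversely, an AND with a mask such as
// 0xFFFF000000000000 (not encodable) is better tested via a single shift.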
16342 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16343 // which may be the result of a CAST. We use the variable 'Op', which is the
16344 // non-casted variable when we check for possible users.
16345 switch (ArithOp.getOpcode()) {
16346 case ISD::ADD:
16347 // Due to an isel shortcoming, be conservative if this add is likely to be
16348 // selected as part of a load-modify-store instruction. When the root node
16349 // in a match is a store, isel doesn't know how to remap non-chain non-flag
16350 // uses of other nodes in the match, such as the ADD in this case. This
16351 // leads to the ADD being left around and reselected, with the result being
16352 // two adds in the output. Alas, even if none of our users are stores, that
16353 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
16354 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
16355 // climbing the DAG back to the root, and it doesn't seem to be worth the
16356 // effort.
16357 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16358 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16359 if (UI->getOpcode() != ISD::CopyToReg &&
16360 UI->getOpcode() != ISD::SETCC &&
16361 UI->getOpcode() != ISD::STORE)
16362   goto default_case;
16364 if (ConstantSDNode *C =
16365 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16366 // An add of one will be selected as an INC.
16367 if (C->isOne() && !Subtarget.slowIncDec()) {
16368 Opcode = X86ISD::INC;
16369 NumOperands = 1;
16370 break;
16371 }
16373 // An add of negative one (subtract of one) will be selected as a DEC.
16374 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
16375 Opcode = X86ISD::DEC;
16376 NumOperands = 1;
16377 break;
16378 }
16379 }
16381 // Otherwise use a regular EFLAGS-setting add.
16382 Opcode = X86ISD::ADD;
16383 NumOperands = 2;
16384 break;
16385 case ISD::SHL:
16386 case ISD::SRL:
16387 // If we have a constant logical shift that's only used in a comparison
16388 // against zero turn it into an equivalent AND. This allows turning it into
16389 // a TEST instruction later.
16390 if (ZeroCheck && Op->hasOneUse() &&
16391 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16392 EVT VT = Op.getValueType();
16393 unsigned BitWidth = VT.getSizeInBits();
16394 unsigned ShAmt = Op->getConstantOperandVal(1);
16395 if (ShAmt >= BitWidth) // Avoid undefined shifts.
16396   break;
16397 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16398 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16399 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16400 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16401   break;
16402 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16403 DAG.getConstant(Mask, dl, VT));
16404 }
16405 break;
16407 case ISD::AND:
16408 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16409 // because a TEST instruction will be better. However, AND should be
16410 // preferred if the instruction can be combined into ANDN.
16411 if (!hasNonFlagsUse(Op)) {
16412 SDValue Op0 = ArithOp->getOperand(0);
16413 SDValue Op1 = ArithOp->getOperand(1);
16414 EVT VT = ArithOp.getValueType();
16415 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16416 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16417 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16419 // If we cannot select an ANDN instruction, check if we can replace
16420 // AND+IMM64 with a shift before giving up. This is possible for masks
16421 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
16422 if (!isProperAndn) {
16426 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16427 auto *CN = dyn_cast<ConstantSDNode>(Op1);
16428 if (!CN)
16429   break;
16431 const APInt &Mask = CN->getAPIntValue();
16432 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16433 break; // Prefer TEST instruction.
16435 unsigned BitWidth = Mask.getBitWidth();
16436 unsigned LeadingOnes = Mask.countLeadingOnes();
16437 unsigned TrailingZeros = Mask.countTrailingZeros();
16439 if (LeadingOnes + TrailingZeros == BitWidth) {
16440 assert(TrailingZeros < VT.getSizeInBits() &&
16441 "Shift amount should be less than the type width");
16442 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16443 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16444 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16445 break;
16446 }
16448 unsigned LeadingZeros = Mask.countLeadingZeros();
16449 unsigned TrailingOnes = Mask.countTrailingOnes();
16451 if (LeadingZeros + TrailingOnes == BitWidth) {
16452 assert(LeadingZeros < VT.getSizeInBits() &&
16453 "Shift amount should be less than the type width");
16454 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16455 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16456 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16457 break;
16458 }
16459 }
16460 }
16462 LLVM_FALLTHROUGH;
16463 case ISD::SUB:
16464 case ISD::OR:
16465 case ISD::XOR:
16467 // Due to the ISEL shortcoming noted above, be conservative if this op is
16468 // likely to be selected as part of a load-modify-store instruction.
16469 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16470 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16471 if (UI->getOpcode() == ISD::STORE)
16472   goto default_case;
16474 // Otherwise use a regular EFLAGS-setting instruction.
16475 switch (ArithOp.getOpcode()) {
16476 default: llvm_unreachable("unexpected operator!");
16477 case ISD::SUB: Opcode = X86ISD::SUB; break;
16478 case ISD::XOR: Opcode = X86ISD::XOR; break;
16479 case ISD::AND: Opcode = X86ISD::AND; break;
16480 case ISD::OR: {
16481 if (!NeedTruncation && ZeroCheck) {
16482 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16483   return EFLAGS;
16484 }
16485 Opcode = X86ISD::OR;
16486 break;
16487 }
16489 case X86ISD::ADD:
16490 case X86ISD::SUB:
16491 case X86ISD::INC:
16492 case X86ISD::DEC:
16493 case X86ISD::OR:
16494 case X86ISD::XOR:
16495 case X86ISD::AND:
16499   return SDValue(Op.getNode(), 1);
16500 default:
16501 default_case:
16502   break;
16503 }
16505 // If we found that truncation is beneficial, perform the truncation and
16506 // update its users.
16507 if (NeedTruncation) {
16508 EVT VT = Op.getValueType();
16509 SDValue WideVal = Op->getOperand(0);
16510 EVT WideVT = WideVal.getValueType();
16511 unsigned ConvertedOp = 0;
16512 // Use a target machine opcode to prevent further DAGCombine
16513 // optimizations that may separate the arithmetic operations
16514 // from the setcc node.
16515 switch (WideVal.getOpcode()) {
16517 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16518 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16519 case ISD::AND: ConvertedOp = X86ISD::AND; break;
16520 case ISD::OR: ConvertedOp = X86ISD::OR; break;
16521 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16522 }
16524 if (ConvertedOp) {
16525 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16526 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16527 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16528 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16529 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16530 }
16531 }
16532 }
16534 if (Opcode == 0) {
16535   // Emit KTEST for bit vectors
16536   if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16537     return Node;
16539   // Emit a CMP with 0, which is the TEST pattern.
16540   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16541                      DAG.getConstant(0, dl, Op.getValueType()));
16542 }
16543 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16544 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16546 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16547 DAG.ReplaceAllUsesWith(Op, New);
16548 return SDValue(New.getNode(), 1);
16551 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
16552 /// equivalent.
16553 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16554 const SDLoc &dl, SelectionDAG &DAG) const {
16555 if (isNullConstant(Op1))
16556 return EmitTest(Op0, X86CC, dl, DAG);
16558 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16559 "Unexpected comparison operation for MVT::i1 operands");
16561 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16562 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16563 // Only promote the compare up to i32 if it is a 16-bit operation
16564 // with an immediate. 16-bit immediates are to be avoided.
16565 if ((Op0.getValueType() == MVT::i16 &&
16566 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16567 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16568 !Subtarget.isAtom()) {
16569 unsigned ExtendOp =
16570 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16571 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16572 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16574 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16575 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16576 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16577                           Op0, Op1);
16578 return SDValue(Sub.getNode(), 1);
16579 }
16580 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16583 /// Convert a comparison if required by the subtarget.
16584 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16585 SelectionDAG &DAG) const {
16586 // If the subtarget does not support the FUCOMI instruction, floating-point
16587 // comparisons have to be converted.
16588 if (Subtarget.hasCMov() ||
16589 Cmp.getOpcode() != X86ISD::CMP ||
16590 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16591 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16592   return Cmp;
16594 // The instruction selector will select an FUCOM instruction instead of
16595 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16596 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16597 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
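// Roughly the following instruction sequence (illustrative):
//   fucom  st(1)      ; compare, result goes into FPSW
//   fnstsw %ax        ; copy FPSW into AX
//   sahf              ; load AH (FPSW bits 8-15) into EFLAGS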
16599 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16600 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16601 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16602 DAG.getConstant(8, dl, MVT::i8));
16603 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16605 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16606 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16607 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16610 /// Check if replacement of SQRT with RSQRT should be disabled.
16611 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16612 EVT VT = Op.getValueType();
16614 // We never want to use both SQRT and RSQRT instructions for the same input.
16615 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16616   return false;
16618 if (VT.isVector())
16619   return Subtarget.hasFastVectorFSQRT();
16620 return Subtarget.hasFastScalarFSQRT();
16621 }
16623 /// The minimum architected relative accuracy is 2^-12. We need one
16624 /// Newton-Raphson step to have a good float result (24 bits of precision).
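/// One step refines an estimate x0 for 1/sqrt(a) as (illustrative):
///   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
/// which roughly doubles the number of accurate bits (~12 -> ~24).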
16625 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16626 SelectionDAG &DAG, int Enabled,
16627 int &RefinementSteps,
16628 bool &UseOneConstNR,
16629 bool Reciprocal) const {
16630 EVT VT = Op.getValueType();
16632 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16633 // TODO: Add support for AVX512 (v16f32).
16634 // It is likely not profitable to do this for f64 because a double-precision
16635 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16636 // instructions: convert to single, rsqrtss, convert back to double, refine
16637 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16638 // along with FMA, this could be a throughput win.
16639 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16640 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16641 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16642 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16643 RefinementSteps = 1;
16645 UseOneConstNR = false;
16646 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16647 }
16649 return SDValue();
16650 }
16651 /// The minimum architected relative accuracy is 2^-12. We need one
16652 /// Newton-Raphson step to have a good float result (24 bits of precision).
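/// One step refines an estimate x0 for 1/a as (illustrative):
///   x1 = x0 * (2.0 - a * x0)
/// which roughly doubles the number of accurate bits (~12 -> ~24).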
16653 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16654                                             int Enabled,
16655                                             int &RefinementSteps) const {
16656 EVT VT = Op.getValueType();
16658 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16659 // TODO: Add support for AVX512 (v16f32).
16660 // It is likely not profitable to do this for f64 because a double-precision
16661 // reciprocal estimate with refinement on x86 prior to FMA requires
16662 // 15 instructions: convert to single, rcpss, convert back to double, refine
16663 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16664 // along with FMA, this could be a throughput win.
16666 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16667 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16668 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16669 // Enable estimate codegen with 1 refinement step for vector division.
16670 // Scalar division estimates are disabled because they break too much
16671 // real-world code. These defaults are intended to match GCC behavior.
16672 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16673   return SDValue();
16675 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16676 RefinementSteps = 1;
16678 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16679 }
16681 return SDValue();
16682 }
16683 /// If we have at least two divisions that use the same divisor, convert to
16684 /// multiplication by a reciprocal. This may need to be adjusted for a given
16685 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16686 /// This is because we still need one division to calculate the reciprocal and
16687 /// then we need two multiplies by that reciprocal as replacements for the
16688 /// original divisions.
16689 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16690   return 2;
16691 }
16693 /// Helper for creating a X86ISD::SETCC node.
16694 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16695 SelectionDAG &DAG) {
16696 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16697 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16700 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16701 /// according to equal/not-equal condition code \p CC.
16702 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16703 const SDLoc &dl, SelectionDAG &DAG) {
16704 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16705 // instruction. Since the shift amount is in-range-or-undefined, we know
16706 // that doing a bittest on the i32 value is ok. We extend to i32 because
16707 // the encoding for the i16 version is larger than the i32 version.
16708 // Also promote i16 to i32 for performance / code size reasons.
16709 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16710 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16712 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16713 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16714 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16715 // known to be zero.
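// Example (illustrative): BT32 reads the bit at (BitNo mod 32) and BT64 at
// (BitNo mod 64); the two agree exactly when bit 5 (value 32) of BitNo is
// zero, e.g. for any BitNo in [0, 31], and the 32-bit form also saves a
// REX prefix.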
16716 if (Src.getValueType() == MVT::i64 &&
16717 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16718 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16720 // If the operand types disagree, extend the shift amount to match. Since
16721 // BT ignores high bits (like shifts) we can use anyextend.
16722 if (Src.getValueType() != BitNo.getValueType())
16723 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
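// BT sets CF to the value of the tested bit, so "bit == 0" (SETEQ) maps to
// COND_AE (CF = 0) and "bit != 0" (SETNE) to COND_B (CF = 1).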
16725 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16726 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16727 return getSETCC(Cond, BT, dl, DAG);
16728 }
16730 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16731 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16732 const SDLoc &dl, SelectionDAG &DAG) {
16733 SDValue Op0 = And.getOperand(0);
16734 SDValue Op1 = And.getOperand(1);
16735 if (Op0.getOpcode() == ISD::TRUNCATE)
16736 Op0 = Op0.getOperand(0);
16737 if (Op1.getOpcode() == ISD::TRUNCATE)
16738 Op1 = Op1.getOperand(0);
16740 SDValue LHS, RHS;
16741 if (Op1.getOpcode() == ISD::SHL)
16742 std::swap(Op0, Op1);
16743 if (Op0.getOpcode() == ISD::SHL) {
16744 if (isOneConstant(Op0.getOperand(0))) {
16745 // If we looked past a truncate, check that it's only truncating away
16746 // sign bits.
16747 unsigned BitWidth = Op0.getValueSizeInBits();
16748 unsigned AndBitWidth = And.getValueSizeInBits();
16749 if (BitWidth > AndBitWidth) {
16750 KnownBits Known;
16751 DAG.computeKnownBits(Op0, Known);
16752 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
16753   return SDValue();
16754 }
16755 LHS = Op1;
16756 RHS = Op0.getOperand(1);
16758 } else if (Op1.getOpcode() == ISD::Constant) {
16759 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16760 uint64_t AndRHSVal = AndRHS->getZExtValue();
16761 SDValue AndLHS = Op0;
16763 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16764 LHS = AndLHS.getOperand(0);
16765 RHS = AndLHS.getOperand(1);
16766 }
16768 // Use BT if the immediate can't be encoded in a TEST instruction.
16769 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16770 LHS = AndLHS;
16771 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16772 }
16773 }
16775 if (LHS.getNode())
16776   return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16778 return SDValue();
16779 }
16781 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16782 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16783 const SDLoc &dl, SelectionDAG &DAG) {
16785 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16786 "Expected TRUNCATE to i1 node");
16788 if (Op.getOperand(0).getOpcode() != ISD::SRL)
16789   return SDValue();
16791 SDValue ShiftRight = Op.getOperand(0);
16792 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16793                            CC, dl, DAG);
16794 }
16796 /// Result of 'and' or 'trunc to i1' is compared against zero.
16797 /// Change to a BT node if possible.
16798 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16799 const SDLoc &dl, SelectionDAG &DAG) const {
16800 if (Op.getOpcode() == ISD::AND)
16801 return LowerAndToBT(Op, CC, dl, DAG);
16802 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16803 return LowerTruncateToBT(Op, CC, dl, DAG);
16805 return SDValue();
16806 }
16807 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
16809 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16810                               SDValue &Op1) {
16811 unsigned SSECC;
16812 bool Swap = false;
16814 // SSE Condition code mapping:
16815 //  0 - EQ
16816 //  1 - LT
16817 //  2 - LE
16818 //  3 - UNORD
16819 //  4 - NEQ
16820 //  5 - NLT
16821 //  6 - NLE
16822 //  7 - ORD
16823 switch (SetCCOpcode) {
16824 default: llvm_unreachable("Unexpected SETCC condition");
16825 case ISD::SETOEQ:
16826 case ISD::SETEQ:  SSECC = 0; break;
16827 case ISD::SETOGT:
16828 case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
16829 case ISD::SETLT:
16830 case ISD::SETOLT: SSECC = 1; break;
16831 case ISD::SETOGE:
16832 case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
16833 case ISD::SETLE:
16834 case ISD::SETOLE: SSECC = 2; break;
16835 case ISD::SETUO:  SSECC = 3; break;
16836 case ISD::SETUNE:
16837 case ISD::SETNE:  SSECC = 4; break;
16838 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16839 case ISD::SETUGE: SSECC = 5; break;
16840 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16841 case ISD::SETUGT: SSECC = 6; break;
16842 case ISD::SETO: SSECC = 7; break;
16843 case ISD::SETUEQ: SSECC = 8; break;
16844 case ISD::SETONE: SSECC = 12; break;
16845 }
16846 if (Swap)
16847   std::swap(Op0, Op1);
16849 return SSECC;
16850 }
16852 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
16853 /// concatenate the result back.
16854 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16855 MVT VT = Op.getSimpleValueType();
16857 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16858 "Unsupported value type for operation");
16860 unsigned NumElems = VT.getVectorNumElements();
16861 SDLoc dl(Op);
16862 SDValue CC = Op.getOperand(2);
16864 // Extract the LHS vectors
16865 SDValue LHS = Op.getOperand(0);
16866 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16867 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16869 // Extract the RHS vectors
16870 SDValue RHS = Op.getOperand(1);
16871 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16872 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16874 // Issue the operation on the smaller types and concatenate the result back
16875 MVT EltVT = VT.getVectorElementType();
16876 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16877 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16878 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16879 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16880 }
16882 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16883 SDValue Op0 = Op.getOperand(0);
16884 SDValue Op1 = Op.getOperand(1);
16885 SDValue CC = Op.getOperand(2);
16886 MVT VT = Op.getSimpleValueType();
16887 SDLoc dl(Op);
16889 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16890 "Unexpected type for boolean compare operation");
16891 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16892 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16893 DAG.getConstant(-1, dl, VT));
16894 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16895 DAG.getConstant(-1, dl, VT));
16896 switch (SetCCOpcode) {
16897 default: llvm_unreachable("Unexpected SETCC condition");
16898 case ISD::SETEQ:
16899 // (x == y) -> ~(x ^ y)
16900 return DAG.getNode(ISD::XOR, dl, VT,
16901 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16902 DAG.getConstant(-1, dl, VT));
16903 case ISD::SETNE:
16904 // (x != y) -> (x ^ y)
16905 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16906 case ISD::SETUGT:
16907 case ISD::SETGT:
16908 // (x > y) -> (x & ~y)
16909 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16910 case ISD::SETULT:
16911 case ISD::SETLT:
16912 // (x < y) -> (~x & y)
16913 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16914 case ISD::SETULE:
16915 case ISD::SETLE:
16916 // (x <= y) -> (~x | y)
16917 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16918 case ISD::SETUGE:
16919 case ISD::SETGE:
16920 // (x >= y) -> (x | ~y)
16921 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
16922 }
16923 }
16925 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16927 SDValue Op0 = Op.getOperand(0);
16928 SDValue Op1 = Op.getOperand(1);
16929 SDValue CC = Op.getOperand(2);
16930 MVT VT = Op.getSimpleValueType();
16931 SDLoc dl(Op);
16933 assert(VT.getVectorElementType() == MVT::i1 &&
16934 "Cannot set masked compare for this operation");
16936 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16937 unsigned Opc = 0;
16938 bool Unsigned = false;
16939 bool Swap = false;
16940 unsigned SSECC = 0;
16941 switch (SetCCOpcode) {
16942 default: llvm_unreachable("Unexpected SETCC condition");
16943 case ISD::SETNE: SSECC = 4; break;
16944 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16945 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16946 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16947 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
16948 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
16949 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
16950 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
16951 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
16952 case ISD::SETLE: SSECC = 2; break;
16953 }
16955 if (Swap)
16956 std::swap(Op0, Op1);
16957 if (Opc)
16958 return DAG.getNode(Opc, dl, VT, Op0, Op1);
16959 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
16960 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16961 DAG.getConstant(SSECC, dl, MVT::i8));
16962 }
16964 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
16965 /// operand \p Op1. If non-trivial (for example because it's not constant)
16966 /// return an empty value.
16967 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
16968 SelectionDAG &DAG) {
16969 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
16970 if (!BV)
16971 return SDValue();
16973 MVT VT = Op1.getSimpleValueType();
16974 MVT EVT = VT.getVectorElementType();
16975 unsigned n = VT.getVectorNumElements();
16976 SmallVector<SDValue, 8> ULTOp1;
16978 for (unsigned i = 0; i < n; ++i) {
16979 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
16980 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
16981 return SDValue();
16983 // Avoid underflow.
16984 APInt Val = Elt->getAPIntValue();
16985 if (Val == 0)
16986 return SDValue();
16988 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
16989 }
16991 return DAG.getBuildVector(VT, dl, ULTOp1);
16992 }
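// Illustrative example (not from the original comments): a v4i32 compare
// X <u <4, 4, 4, 4> becomes X <=u <3, 3, 3, 3>, which the PSUBUS-based
// lowering below handles without swapping the operands.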
16994 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
16995 SelectionDAG &DAG) {
16996 SDValue Op0 = Op.getOperand(0);
16997 SDValue Op1 = Op.getOperand(1);
16998 SDValue CC = Op.getOperand(2);
16999 MVT VT = Op.getSimpleValueType();
17000 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17001 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17002 SDLoc dl(Op);
17004 if (isFP) {
17005 #ifndef NDEBUG
17006 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17007 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17008 #endif
17010 unsigned Opc;
17011 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17012 assert(VT.getVectorNumElements() <= 16);
17013 Opc = X86ISD::CMPM;
17014 } else {
17015 Opc = X86ISD::CMPP;
17016 // The SSE/AVX packed FP comparison nodes are defined with a
17017 // floating-point vector result that matches the operand type. This allows
17018 // them to work with an SSE1 target (integer vector types are not legal).
17019 VT = Op0.getSimpleValueType();
17020 }
17022 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17023 // emit two comparisons and a logic op to tie them together.
17024 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
17025 // available.
17026 SDValue Cmp;
17027 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
17028 if (SSECC == 8) {
17029 // LLVM predicate is SETUEQ or SETONE.
17030 unsigned CC0, CC1;
17031 unsigned CombineOpc;
17032 if (SetCCOpcode == ISD::SETUEQ) {
17033 CC0 = 3; // UNORD
17034 CC1 = 0; // EQ
17035 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17036 static_cast<unsigned>(ISD::OR);
17037 } else {
17038 assert(SetCCOpcode == ISD::SETONE);
17039 CC0 = 7; // ORD
17040 CC1 = 4; // NEQ
17041 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17042 static_cast<unsigned>(ISD::AND);
17043 }
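// Worked example (illustrative): for v4f32, SETUEQ expands to
//   (CMPPS x, y, 3 /*UNORD*/) FOR (CMPPS x, y, 0 /*EQ*/)
// and SETONE expands to
//   (CMPPS x, y, 7 /*ORD*/) FAND (CMPPS x, y, 4 /*NEQ*/).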
17045 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17046 DAG.getConstant(CC0, dl, MVT::i8));
17047 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17048 DAG.getConstant(CC1, dl, MVT::i8));
17049 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17050 } else {
17051 // Handle all other FP comparisons here.
17052 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17053 DAG.getConstant(SSECC, dl, MVT::i8));
17054 }
17056 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17057 // result type of SETCC. The bitcast is expected to be optimized away
17058 // during combining/isel.
17059 if (Opc == X86ISD::CMPP)
17060 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17062 return Cmp;
17063 }
17065 MVT VTOp0 = Op0.getSimpleValueType();
17066 assert(VTOp0 == Op1.getSimpleValueType() &&
17067 "Expected operands with same type!");
17068 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17069 "Invalid number of packed elements for source and destination!");
17071 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17072 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17073 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17074 // legalizer firstly checks if the first operand in input to the setcc has
17075 // a legal type. If so, then it promotes the return type to that same type.
17076 // Otherwise, the return type is promoted to the 'next legal type' which,
17077 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17079 // We reach this code only if the following two conditions are met:
17080 // 1. Both return type and operand type have been promoted to wider types
17081 // by the type legalizer.
17082 // 2. The original operand type has been promoted to a 256-bit vector.
17084 // Note that condition 2. only applies for AVX targets.
17085 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
17086 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17087 }
17089 // The non-AVX512 code below works under the assumption that source and
17090 // destination types are the same.
17091 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17092 "Value types for source and destination must be the same!");
17094 // Break 256-bit integer vector compare into smaller ones.
17095 if (VT.is256BitVector() && !Subtarget.hasInt256())
17096 return Lower256IntVSETCC(Op, DAG);
17098 // Operands are boolean (vectors of i1)
17099 MVT OpVT = Op1.getSimpleValueType();
17100 if (OpVT.getVectorElementType() == MVT::i1)
17101 return LowerBoolVSETCC_AVX512(Op, DAG);
17103 // The result is boolean, but operands are int/float
17104 if (VT.getVectorElementType() == MVT::i1) {
17105 // In the AVX-512 architecture setcc returns a mask with i1 elements,
17106 // but there is no compare instruction for i8 and i16 elements in KNL.
17107 // In this case use an SSE compare.
17108 bool UseAVX512Inst =
17109 (OpVT.is512BitVector() ||
17110 OpVT.getScalarSizeInBits() >= 32 ||
17111 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17113 if (UseAVX512Inst)
17114 return LowerIntVSETCC_AVX512(Op, DAG);
17116 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17117 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17118 }
17120 // Lower using XOP integer comparisons.
17121 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17122 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17123 // Translate compare code to XOP PCOM compare mode.
17124 unsigned CmpMode = 0;
17125 switch (SetCCOpcode) {
17126 default: llvm_unreachable("Unexpected SETCC condition");
17127 case ISD::SETULT:
17128 case ISD::SETLT: CmpMode = 0x00; break;
17129 case ISD::SETULE:
17130 case ISD::SETLE: CmpMode = 0x01; break;
17131 case ISD::SETUGT:
17132 case ISD::SETGT: CmpMode = 0x02; break;
17133 case ISD::SETUGE:
17134 case ISD::SETGE: CmpMode = 0x03; break;
17135 case ISD::SETEQ: CmpMode = 0x04; break;
17136 case ISD::SETNE: CmpMode = 0x05; break;
17137 }
17139 // Are we comparing unsigned or signed integers?
17140 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
17141 ? X86ISD::VPCOMU : X86ISD::VPCOM;
17143 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17144 DAG.getConstant(CmpMode, dl, MVT::i8));
17145 }
17147 // We are handling one of the integer comparisons here. Since SSE only has
17148 // GT and EQ comparisons for integer, swapping operands and multiple
17149 // operations may be required for some comparisons.
17150 unsigned Opc;
17151 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
17152 bool Subus = false;
17154 switch (SetCCOpcode) {
17155 default: llvm_unreachable("Unexpected SETCC condition");
17156 case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
17157 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
17158 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17159 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
17160 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17161 case ISD::SETLE: Opc = X86ISD::PCMPGT;
17162 Invert = true; break;
17163 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17164 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
17165 FlipSigns = true; break;
17166 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
17167 case ISD::SETULE: Opc = X86ISD::PCMPGT;
17168 FlipSigns = true; Invert = true; break;
17169 }
17171 // Special case: Use min/max operations for SETULE/SETUGE
17172 MVT VET = VT.getVectorElementType();
17173 bool hasMinMax =
17174 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
17175 || (Subtarget.hasSSE2() && (VET == MVT::i8));
17177 if (hasMinMax) {
17178 switch (SetCCOpcode) {
17179 default: break;
17180 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17181 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17182 }
17184 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
17185 }
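// Identities used here (illustrative note): x u<= y iff umin(x, y) == x,
// and x u>= y iff umax(x, y) == x, so the final compare below becomes
// PCMPEQ(Op0, UMIN/UMAX(Op0, Op1)).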
17187 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17188 if (!MinMax && hasSubus) {
17189 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17190 // Op0 u<= Op1:
17191 // t = psubus Op0, Op1
17192 // pcmpeq t, <0..0>
17193 switch (SetCCOpcode) {
17194 default: break;
17195 case ISD::SETULT: {
17196 // If the comparison is against a constant we can turn this into a
17197 // setule. With psubus, setule does not require a swap. This is
17198 // beneficial because the constant in the register is no longer
17199 // destructed as the destination so it can be hoisted out of a loop.
17200 // Only do this pre-AVX since vpcmp* is no longer destructive.
17201 if (Subtarget.hasAVX())
17202 break;
17203 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17204 Op1 = ULEOp1;
17205 Subus = true; Invert = false; Swap = false;
17206 }
17207 break;
17208 }
17209 // Psubus is better than flip-sign because it requires no inversion.
17210 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17211 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17212 }
17214 if (Subus) {
17215 Opc = X86ISD::SUBUS;
17216 FlipSigns = false;
17217 Invert = false;
17218 }
17219 }
17220 if (Swap)
17221 std::swap(Op0, Op1);
17223 // Check that the operation in question is available (most are plain SSE2,
17224 // but PCMPGTQ and PCMPEQQ have different requirements).
17225 if (VT == MVT::v2i64) {
17226 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17227 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17229 // First cast everything to the right type.
17230 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17231 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17233 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17234 // bits of the inputs before performing those operations. The lower
17235 // compare is always unsigned.
17236 SDValue SB;
17237 if (FlipSigns) {
17238 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17239 } else {
17240 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17241 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17242 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17243 }
17244 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17245 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17247 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
17248 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17249 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17251 // Create masks for only the low parts/high parts of the 64 bit integers.
17252 static const int MaskHi[] = { 1, 1, 3, 3 };
17253 static const int MaskLo[] = { 0, 0, 2, 2 };
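// Illustrative: with dwords [lo0, hi0, lo1, hi1], MaskHi {1,1,3,3}
// broadcasts each lane's high dword and MaskLo {0,0,2,2} its low dword,
// letting the 64-bit lane results be assembled from 32-bit compares.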
17254 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17255 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17256 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17258 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17259 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17261 if (Invert)
17262 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17264 return DAG.getBitcast(VT, Result);
17265 }
17267 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17268 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17269 // pcmpeqd + pshufd + pand.
17270 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17272 // First cast everything to the right type.
17273 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17274 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17276 // Do the compare.
17277 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17279 // Make sure the lower and upper halves are both all-ones.
17280 static const int Mask[] = { 1, 0, 3, 2 };
17281 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17282 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17284 if (Invert)
17285 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17287 return DAG.getBitcast(VT, Result);
17288 }
17289 }
17291 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17292 // bits of the inputs before performing those operations.
17293 if (FlipSigns) {
17294 MVT EltVT = VT.getVectorElementType();
17295 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17296 VT);
17297 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17298 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17299 }
17301 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17303 // If the logical-not of the result is required, perform that now.
17304 if (Invert)
17305 Result = DAG.getNOT(dl, Result, VT);
17307 if (MinMax)
17308 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17310 if (Subus)
17311 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17312 getZeroVector(VT, Subtarget, DAG, dl));
17314 return Result;
17315 }
17317 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17319 MVT VT = Op.getSimpleValueType();
17321 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17322 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) &&
17323 "SetCC type must be 8-bit or 1-bit integer");
17324 SDValue Op0 = Op.getOperand(0);
17325 SDValue Op1 = Op.getOperand(1);
17326 SDLoc dl(Op);
17327 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17329 // Optimize to BT if possible.
17330 // Lower (X & (1 << N)) == 0 to BT(X, N).
17331 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17332 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17333 // Lower (trunc (X >> N) to i1) to BT(X, N).
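// Illustrative example (not from the original comments): for
// (setcc (and (srl X, 9), 1), 0, ne) this produces
// (X86ISD::SETCC COND_B, (X86ISD::BT X, 9)) -- BT copies bit 9 of X
// into CF and the SETCC reads it back.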
17334 if (Op0.hasOneUse() && isNullConstant(Op1) &&
17335 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17336 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
17337 if (VT == MVT::i1)
17338 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
17339 return NewSetCC;
17340 }
17341 }
17343 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
17344 // these.
17345 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17346 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17348 // If the input is a setcc, then reuse the input setcc or use a new one with
17349 // the inverted condition.
17350 if (Op0.getOpcode() == X86ISD::SETCC) {
17351 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17352 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17354 if (!Invert)
17355 return Op0;
17356 CCode = X86::GetOppositeBranchCondition(CCode);
17357 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17358 if (VT == MVT::i1)
17359 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17360 return SetCC;
17361 }
17362 }
17363 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17364 if (isOneConstant(Op1)) {
17365 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17366 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17367 }
17368 if (!isNullConstant(Op1)) {
17369 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17370 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17371 }
17372 }
17374 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17375 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17376 if (X86CC == X86::COND_INVALID)
17377 return SDValue();
17379 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17380 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17381 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17382 if (VT == MVT::i1)
17383 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17384 return SetCC;
17385 }
17387 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
17388 SDValue LHS = Op.getOperand(0);
17389 SDValue RHS = Op.getOperand(1);
17390 SDValue Carry = Op.getOperand(2);
17391 SDValue Cond = Op.getOperand(3);
17392 SDLoc DL(Op);
17394 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
17395 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17397 // Recreate the carry if needed.
17398 EVT CarryVT = Carry.getValueType();
17399 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
17400 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
17401 Carry, DAG.getConstant(NegOne, DL, CarryVT));
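// Illustrative note: adding all-ones (-1) to the carry value overflows
// exactly when the carry is nonzero, so the ADD's EFLAGS result has CF
// set iff the incoming carry was 1; the SBB below consumes that CF.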
17403 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17404 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
17405 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17406 if (Op.getSimpleValueType() == MVT::i1)
17407 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17408 return SetCC;
17409 }
17411 /// Return true if opcode is a X86 logical comparison.
17412 static bool isX86LogicalCmp(SDValue Op) {
17413 unsigned Opc = Op.getOpcode();
17414 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17415 Opc == X86ISD::SAHF)
17416 return true;
17417 if (Op.getResNo() == 1 &&
17418 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17419 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
17420 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17421 Opc == X86ISD::XOR || Opc == X86ISD::AND))
17422 return true;
17424 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17425 return true;
17427 return false;
17428 }
17430 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17431 if (V.getOpcode() != ISD::TRUNCATE)
17432 return false;
17434 SDValue VOp0 = V.getOperand(0);
17435 unsigned InBits = VOp0.getValueSizeInBits();
17436 unsigned Bits = V.getValueSizeInBits();
17437 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17438 }
17440 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17441 bool AddTest = true;
17442 SDValue Cond = Op.getOperand(0);
17443 SDValue Op1 = Op.getOperand(1);
17444 SDValue Op2 = Op.getOperand(2);
17445 SDLoc DL(Op);
17446 MVT VT = Op1.getSimpleValueType();
17447 SDValue CC;
17449 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17450 // are available or VBLENDV if AVX is available.
17451 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17452 if (Cond.getOpcode() == ISD::SETCC &&
17453 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17454 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17455 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17456 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17457 int SSECC = translateX86FSETCC(
17458 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17460 if (SSECC != 8) {
17461 if (Subtarget.hasAVX512()) {
17462 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
17463 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17464 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17465 DL, VT, Cmp, Op1, Op2);
17466 }
17468 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17469 DAG.getConstant(SSECC, DL, MVT::i8));
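// Illustrative note: Cmp is a full-width mask (all-ones when the
// predicate holds, all-zeros otherwise), so the FAND/FANDN/FOR sequence
// below computes (Cmp & Op1) | (~Cmp & Op2), a branchless scalar select.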
17471 // If we have AVX, we can use a variable vector select (VBLENDV) instead
17472 // of 3 logic instructions for size savings and potentially speed.
17473 // Unfortunately, there is no scalar form of VBLENDV.
17475 // If either operand is a constant, don't try this. We can expect to
17476 // optimize away at least one of the logic instructions later in that
17477 // case, so that sequence would be faster than a variable blend.
17479 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17480 // uses XMM0 as the selection register. That may need just as many
17481 // instructions as the AND/ANDN/OR sequence due to register moves, so
17482 // don't bother.
17484 if (Subtarget.hasAVX() &&
17485 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17487 // Convert to vectors, do a VSELECT, and convert back to scalar.
17488 // All of the conversions should be optimized away.
17490 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17491 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17492 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17493 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17495 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17496 VCmp = DAG.getBitcast(VCmpVT, VCmp);
17498 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
17500 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17501 VSel, DAG.getIntPtrConstant(0, DL));
17502 }
17503 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17504 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17505 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17506 }
17507 }
17509 // AVX512 fallback is to lower selects of scalar floats to masked moves.
17510 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
17511 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
17512 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
17513 }
17515 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17516 SDValue Op1Scalar;
17517 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17518 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17519 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17520 Op1Scalar = Op1.getOperand(0);
17521 SDValue Op2Scalar;
17522 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17523 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17524 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17525 Op2Scalar = Op2.getOperand(0);
17526 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17527 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
17528 Op1Scalar, Op2Scalar);
17529 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17530 return DAG.getBitcast(VT, newSelect);
17531 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17532 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17533 DAG.getIntPtrConstant(0, DL));
17534 }
17535 }
17537 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17538 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17539 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17540 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17541 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17542 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17543 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
17544 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17545 }
17547 if (Cond.getOpcode() == ISD::SETCC) {
17548 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17549 Cond = NewCond;
17550 // If the condition was updated, it's possible that the operands of the
17551 // select were also updated (for example, EmitTest has a RAUW). Refresh
17552 // the local references to the select operands in case they got stale.
17553 Op1 = Op.getOperand(1);
17554 Op2 = Op.getOperand(2);
17555 }
17556 }
17558 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17559 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17560 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17561 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17562 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17563 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
17564 if (Cond.getOpcode() == X86ISD::SETCC &&
17565 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17566 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17567 SDValue Cmp = Cond.getOperand(1);
17568 unsigned CondCode =
17569 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17571 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17572 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17573 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17575 SDValue CmpOp0 = Cmp.getOperand(0);
17576 // Apply further optimizations for special cases
17577 // (select (x != 0), -1, 0) -> neg & sbb
17578 // (select (x == 0), 0, -1) -> neg & sbb
17579 if (isNullConstant(Y) &&
17580 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17581 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17582 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17583 DAG.getConstant(0, DL,
17584 CmpOp0.getValueType()),
17585 CmpOp0);
17586 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17587 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17588 SDValue(Neg.getNode(), 1));
17589 return Res;
17590 }
17592 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17593 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17594 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17596 SDValue Res = // Res = 0 or -1.
17597 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17598 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17600 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17601 Res = DAG.getNOT(DL, Res, Res.getValueType());
17603 if (!isNullConstant(Op2))
17604 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17605 return Res;
17606 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17607 Cmp.getOperand(0).getOpcode() == ISD::AND &&
17608 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17609 SDValue CmpOp0 = Cmp.getOperand(0);
17610 SDValue Src1, Src2;
17611 // true if Op2 is XOR or OR operator and one of its operands
17612 // is equal to Op1
17613 // ( a , a op b) || ( b , a op b)
17614 auto isOrXorPattern = [&]() {
17615 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17616 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17617 Src1 =
17618 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17619 Src2 = Op1;
17620 return true;
17621 }
17622 return false;
17623 };
17625 if (isOrXorPattern()) {
17626 SDValue Neg;
17627 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17628 // We need a mask of all zeros or all ones with the same size as the
17629 // other operands.
17630 if (CmpSz > VT.getSizeInBits())
17631 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17632 else if (CmpSz < VT.getSizeInBits())
17633 Neg = DAG.getNode(ISD::AND, DL, VT,
17634 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17635 DAG.getConstant(1, DL, VT));
17636 else
17637 Neg = CmpOp0;
17638 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17639 Neg); // -(and (x, 0x1))
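// Illustrative: Neg is 0 or 1 here, so Mask is all-zeros or all-ones;
// (Mask & z) op y then yields y or (z op y), matching the select arms.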
17640 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17641 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
17642 }
17643 }
17644 }
17646 // Look past (and (setcc_carry (cmp ...)), 1).
17647 if (Cond.getOpcode() == ISD::AND &&
17648 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17649 isOneConstant(Cond.getOperand(1)))
17650 Cond = Cond.getOperand(0);
17652 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17653 // setting operand in place of the X86ISD::SETCC.
17654 unsigned CondOpcode = Cond.getOpcode();
17655 if (CondOpcode == X86ISD::SETCC ||
17656 CondOpcode == X86ISD::SETCC_CARRY) {
17657 CC = Cond.getOperand(0);
17659 SDValue Cmp = Cond.getOperand(1);
17660 unsigned Opc = Cmp.getOpcode();
17661 MVT VT = Op.getSimpleValueType();
17663 bool IllegalFPCMov = false;
17664 if (VT.isFloatingPoint() && !VT.isVector() &&
17665 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17666 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17668 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17669 Opc == X86ISD::BT) { // FIXME
17670 Cond = Cmp;
17671 AddTest = false;
17672 }
17673 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17674 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17675 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17676 Cond.getOperand(0).getValueType() != MVT::i8)) {
17677 SDValue LHS = Cond.getOperand(0);
17678 SDValue RHS = Cond.getOperand(1);
17679 unsigned X86Opcode;
17680 unsigned X86Cond;
17681 SDVTList VTs;
17682 switch (CondOpcode) {
17683 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17684 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17685 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17686 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17687 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17688 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17689 default: llvm_unreachable("unexpected overflowing operator");
17690 }
17691 if (CondOpcode == ISD::UMULO)
17692 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17693 MVT::i32);
17694 else
17695 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17697 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17699 if (CondOpcode == ISD::UMULO)
17700 Cond = X86Op.getValue(2);
17701 else
17702 Cond = X86Op.getValue(1);
17704 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17705 AddTest = false;
17706 }
17708 if (AddTest) {
17709 // Look past the truncate if the high bits are known zero.
17710 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17711 Cond = Cond.getOperand(0);
17713 // We know the result of AND is compared against zero. Try to match
17714 // it to BT.
17715 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17716 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17717 CC = NewSetCC.getOperand(0);
17718 Cond = NewSetCC.getOperand(1);
17719 AddTest = false;
17720 }
17721 }
17722 }
17724 if (AddTest) {
17725 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17726 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17727 }
17729 // a < b ? -1 : 0 -> RES = ~setcc_carry
17730 // a < b ? 0 : -1 -> RES = setcc_carry
17731 // a >= b ? -1 : 0 -> RES = setcc_carry
17732 // a >= b ? 0 : -1 -> RES = ~setcc_carry
17733 if (Cond.getOpcode() == X86ISD::SUB) {
17734 Cond = ConvertCmpIfNecessary(Cond, DAG);
17735 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17737 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17738 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17739 (isNullConstant(Op1) || isNullConstant(Op2))) {
17740 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17741 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17742 Cond);
17743 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17744 return DAG.getNOT(DL, Res, Res.getValueType());
17745 return Res;
17746 }
17747 }
17749 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17750 // widen the cmov and push the truncate through. This avoids introducing a new
17751 // branch during isel and doesn't add any extensions.
17752 if (Op.getValueType() == MVT::i8 &&
17753 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17754 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17755 if (T1.getValueType() == T2.getValueType() &&
17756 // Blacklist CopyFromReg to avoid partial register stalls.
17757 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17758 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17759 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17760 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17761 }
17762 }
17764 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17765 // condition is true.
17766 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17767 SDValue Ops[] = { Op2, Op1, CC, Cond };
17768 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17769 }
17771 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17772 const X86Subtarget &Subtarget,
17773 SelectionDAG &DAG) {
17774 MVT VT = Op->getSimpleValueType(0);
17775 SDValue In = Op->getOperand(0);
17776 MVT InVT = In.getSimpleValueType();
17777 MVT VTElt = VT.getVectorElementType();
17778 MVT InVTElt = InVT.getVectorElementType();
17779 SDLoc dl(Op);
17781 // SKX processor
17782 if ((InVTElt == MVT::i1) &&
17783 (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
17785 ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
17787 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17789 unsigned NumElts = VT.getVectorNumElements();
17791 if (VT.is512BitVector() && InVTElt != MVT::i1 &&
17792 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
17793 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17794 return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
17795 return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
17796 }
17798 if (InVTElt != MVT::i1)
17799 return SDValue();
17801 MVT ExtVT = VT;
17802 if (!VT.is512BitVector() && !Subtarget.hasVLX())
17803 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17805 SDValue V;
17806 if (Subtarget.hasDQI()) {
17807 V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
17808 assert(!VT.is512BitVector() && "Unexpected vector type");
17809 } else {
17810 SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
17811 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
17812 V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
17813 if (ExtVT == VT)
17814 return V;
17815 }
17817 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17818 }
17820 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17821 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17822 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17823 // MVT::v64i8 when BWI is not supported, but AVX512 is.
17824 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17825 const X86Subtarget &Subtarget,
17826 SelectionDAG &DAG) {
17827 SDValue In = Op->getOperand(0);
17828 MVT VT = Op->getSimpleValueType(0);
17829 MVT InVT = In.getSimpleValueType();
17830 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17832 MVT SVT = VT.getVectorElementType();
17833 MVT InSVT = InVT.getVectorElementType();
17834 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17836 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17837 return SDValue();
17838 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17839 return SDValue();
17840 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17841 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17842 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17843 return SDValue();
17845 SDLoc dl(Op);
17847 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17848 // For 512-bit vectors, we need 128-bits or 256-bits.
17849 if (VT.getSizeInBits() > 128) {
17850 // Input needs to be at least the same number of elements as output, and
17851 // at least 128-bits.
17852 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17853 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17854 InVT = In.getSimpleValueType();
17855 }
17856 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17857 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17859 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
17860 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
17861 // need to be handled here for 256/512-bit results.
17862 if (Subtarget.hasInt256()) {
17863 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
17864 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17865 X86ISD::VSEXT : X86ISD::VZEXT;
17866 return DAG.getNode(ExtOpc, dl, VT, In);
17867 }
17869 // We should only get here for sign extend.
17870 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17871 "Unexpected opcode!");
17873 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
17874 SDValue Curr = In;
17875 MVT CurrVT = InVT;
17877 // As SRAI is only available on i16/i32 types, we expand only up to i32
17878 // and handle i64 separately.
17879 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17880 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17881 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17882 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17883 Curr = DAG.getBitcast(CurrVT, Curr);
17884 }
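// Illustrative trace (pre-SSE4.1, v16i8 -> v8i16): UNPCKL with undef
// puts each source byte into the high byte of a 16-bit lane, and the
// VSRAI below shifts it back down, replicating the sign bit.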
17886 SDValue SignExt = Curr;
17887 if (CurrVT != InVT) {
17888 unsigned SignExtShift =
17889 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17890 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17891 DAG.getConstant(SignExtShift, dl, MVT::i8));
17892 }
17894 if (CurrVT == VT)
17895 return SignExt;
17897 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17898 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17899 DAG.getConstant(31, dl, MVT::i8));
17900 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17901 return DAG.getBitcast(VT, Ext);
17902 }
17904 return SDValue();
17905 }
17907 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17908 SelectionDAG &DAG) {
17909 MVT VT = Op->getSimpleValueType(0);
17910 SDValue In = Op->getOperand(0);
17911 MVT InVT = In.getSimpleValueType();
17912 SDLoc dl(Op);
17914 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17915 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
17917 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17918 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17919 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17920 return SDValue();
17922 if (Subtarget.hasInt256())
17923 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17925 // Optimize vectors in AVX mode
17926 // Sign extend v8i16 to v8i32 and
17927 // v4i32 to v4i64.
17929 // Divide input vector into two parts
17930 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
17931 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
17932 // concat the vectors to original VT
17934 unsigned NumElems = InVT.getVectorNumElements();
17935 SDValue Undef = DAG.getUNDEF(InVT);
17937 SmallVector<int,8> ShufMask1(NumElems, -1);
17938 for (unsigned i = 0; i != NumElems/2; ++i)
17939 ShufMask1[i] = i;
17941 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
17943 SmallVector<int,8> ShufMask2(NumElems, -1);
17944 for (unsigned i = 0; i != NumElems/2; ++i)
17945 ShufMask2[i] = i + NumElems/2;
17947 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17949 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17950 VT.getVectorNumElements() / 2);
17952 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
17953 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
17955 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
17956 }
17958 // Lower truncating store. We need a special lowering to vXi1 vectors
17959 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
17960 SelectionDAG &DAG) {
17961 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
17962 SDLoc dl(St);
17963 EVT MemVT = St->getMemoryVT();
17964 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
17965 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
17966 "Expected truncstore of i1 vector");
17968 SDValue Op = St->getValue();
17969 MVT OpVT = Op.getValueType().getSimpleVT();
17970 unsigned NumElts = OpVT.getVectorNumElements();
17971 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17972 NumElts == 16) {
17973 // Truncate and store - everything is legal
17974 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
17975 if (MemVT.getSizeInBits() < 8)
17976 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
17977 DAG.getUNDEF(MVT::v8i1), Op,
17978 DAG.getIntPtrConstant(0, dl));
17979 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17980 St->getMemOperand());
17981 }
17983 // A subset, assume that we have only AVX-512F
17984 if (NumElts <= 8) {
17985 if (NumElts < 8) {
17986 // Extend to 8-elts vector
17987 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
17988 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
17989 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
17990 }
17991 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
17992 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17993 St->getMemOperand());
17994 }
17995 // v32i8
17996 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
17997 // Divide the vector into 2 parts and store each part separately
17998 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17999 DAG.getIntPtrConstant(0, dl));
18000 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18001 SDValue BasePtr = St->getBasePtr();
18002 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18003 St->getMemOperand());
18004 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18005 DAG.getIntPtrConstant(16, dl));
18006 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18008 SDValue BasePtrHi =
18009 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18010 DAG.getConstant(2, dl, BasePtr.getValueType()));
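// Illustrative note: each v16i1 half is stored as a 16-bit mask, so the
// high half lives at byte offset 2 from the base pointer.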
18012 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18013 BasePtrHi, St->getMemOperand());
18014 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18015 }
18017 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18018 const X86Subtarget &Subtarget,
18019 SelectionDAG &DAG) {
18021 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18022 SDLoc dl(Ld);
18023 EVT MemVT = Ld->getMemoryVT();
18024 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18025 "Expected i1 vector load");
18026 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18027 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18028 MVT VT = Op.getValueType().getSimpleVT();
18029 unsigned NumElts = VT.getVectorNumElements();
18031 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18032 (Subtarget.hasDQI() && NumElts < 16) ||
18033 NumElts == 16) {
18034 // Load and extend - everything is legal
18035 if (NumElts < 8) {
18036 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18037 Ld->getBasePtr(),
18038 Ld->getMemOperand());
18039 // Replace chain users with the new chain.
18040 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18041 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18042 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18043 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18045 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18046 DAG.getIntPtrConstant(0, dl));
18047 }
18048 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18049 Ld->getBasePtr(),
18050 Ld->getMemOperand());
18051 // Replace chain users with the new chain.
18052 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18053 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18055 // Finally, do a normal sign-extend to the desired register.
18056 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18057 }
18059 if (NumElts <= 8) {
18060 // A subset, assume that we have only AVX-512F
18061 unsigned NumBitsToLoad = 8;
18062 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18063 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18064 Ld->getBasePtr(),
18065 Ld->getMemOperand());
18066 // Replace chain users with the new chain.
18067 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18068 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18070 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18071 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18073 if (NumElts == 8)
18074 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18076 // Handle the remaining v4i1 and v2i1 cases here.
18078 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18079 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18080 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18081 DAG.getIntPtrConstant(0, dl));
18082 }
18084 assert(VT == MVT::v32i8 && "Unexpected extload type");
18086 SmallVector<SDValue, 2> Chains;
18088 SDValue BasePtr = Ld->getBasePtr();
18089 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18090 Ld->getBasePtr(),
18091 Ld->getMemOperand());
18092 Chains.push_back(LoadLo.getValue(1));
18094 SDValue BasePtrHi =
18095 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18096 DAG.getConstant(2, dl, BasePtr.getValueType()));
18098 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18099 BasePtrHi,
18100 Ld->getMemOperand());
18101 Chains.push_back(LoadHi.getValue(1));
18102 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18103 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18105 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18106 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18107 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18108 }
18110 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18111 // may emit an illegal shuffle but the expansion is still better than scalar
18112 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18113 // we'll emit a shuffle and an arithmetic shift.
18114 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18115 // TODO: It is possible to support ZExt by zeroing the undef values during
18116 // the shuffle phase or after the shuffle.
18117 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18118 SelectionDAG &DAG) {
18119 MVT RegVT = Op.getSimpleValueType();
18120 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18121 assert(RegVT.isInteger() &&
18122 "We only custom lower integer vector sext loads.");
18124 // Nothing useful we can do without SSE2 shuffles.
18125 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18127 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18129 EVT MemVT = Ld->getMemoryVT();
18130 if (MemVT.getScalarType() == MVT::i1)
18131 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18133 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18134 unsigned RegSz = RegVT.getSizeInBits();
18136 ISD::LoadExtType Ext = Ld->getExtensionType();
18138 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18139 && "Only anyext and sext are currently implemented.");
18140 assert(MemVT != RegVT && "Cannot extend to the same type");
18141 assert(MemVT.isVector() && "Must load a vector from memory");
18143 unsigned NumElems = RegVT.getVectorNumElements();
18144 unsigned MemSz = MemVT.getSizeInBits();
18145 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18147 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18148 // The only way in which we have a legal 256-bit vector result but not the
18149 // integer 256-bit operations needed to directly lower a sextload is if we
18150 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18151 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18152 // correctly legalized. We do this late to allow the canonical form of
18153 // sextload to persist throughout the rest of the DAG combiner -- it wants
18154 // to fold together any extensions it can, and so will fuse a sign_extend
18155 // of an sextload into a sextload targeting a wider value.
18156 SDValue Load;
18157 if (MemSz == 128) {
18158 // Just switch this to a normal load.
18159 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18160 "it must be a legal 128-bit vector "
18162 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18163 Ld->getPointerInfo(), Ld->getAlignment(),
18164 Ld->getMemOperand()->getFlags());
18165 } else {
18166 assert(MemSz < 128 &&
18167 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18168 // Do an sext load to a 128-bit vector type. We want to use the same
18169 // number of elements, but elements half as wide. This will end up being
18170 // recursively lowered by this routine, but will succeed as we definitely
18171 // have all the necessary features if we're using AVX1.
18172 EVT HalfEltVT =
18173 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18174 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18175 Load =
18176 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18177 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18178 Ld->getMemOperand()->getFlags());
18179 }
18181 // Replace chain users with the new chain.
18182 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18183 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18185 // Finally, do a normal sign-extend to the desired register.
18186 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18187 }
18189 // All sizes must be a power of two.
18190 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18191 "Non-power-of-two elements are not custom lowered!");
18193 // Attempt to load the original value using scalar loads.
18194 // Find the largest scalar type that divides the total loaded size.
18195 MVT SclrLoadTy = MVT::i8;
18196 for (MVT Tp : MVT::integer_valuetypes()) {
18197 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18198 SclrLoadTy = Tp;
18199 }
18200 }
18202 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18203 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18204 (64 <= MemSz))
18205 SclrLoadTy = MVT::f64;
18207 // Calculate the number of scalar loads that we need to perform
18208 // in order to load our vector from memory.
18209 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18211 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18212 "Can only lower sext loads with a single scalar load!");
18214 unsigned loadRegZize = RegSz;
18215 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18216 loadRegZize = 128;
18218 // Represent our vector as a sequence of elements which are the
18219 // largest scalar that we can load.
18220 EVT LoadUnitVecVT = EVT::getVectorVT(
18221 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18223 // Represent the data using the same element type that is stored in
18224 // memory. In practice, we "widen" MemVT.
18225 EVT WideVecVT =
18226 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18227 loadRegZize / MemVT.getScalarSizeInBits());
18229 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18230 "Invalid vector type");
18232 // We can't shuffle using an illegal type.
18233 assert(TLI.isTypeLegal(WideVecVT) &&
18234 "We only lower types that form legal widened vector types");
18236 SmallVector<SDValue, 8> Chains;
18237 SDValue Ptr = Ld->getBasePtr();
18238 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18239 TLI.getPointerTy(DAG.getDataLayout()));
18240 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18242 for (unsigned i = 0; i < NumLoads; ++i) {
18243 // Perform a single load.
18244 SDValue ScalarLoad =
18245 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18246 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18247 Chains.push_back(ScalarLoad.getValue(1));
18248 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18249 // another round of DAGCombining.
18250 if (i == 0)
18251 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18252 else
18253 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18254 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18256 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18257 }
18259 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18261 // Bitcast the loaded value to a vector of the original element type, in
18262 // the size of the target vector type.
18263 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18264 unsigned SizeRatio = RegSz / MemSz;
18266 if (Ext == ISD::SEXTLOAD) {
18267 // If we have SSE4.1, we can directly emit a VSEXT node.
18268 if (Subtarget.hasSSE41()) {
18269 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18270 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18271 return Sext;
18272 }
18274 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18275 // lanes.
18276 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18277 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18279 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18280 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18281 return Shuff;
18282 }
18284 // Redistribute the loaded elements into the different locations.
18285 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18286 for (unsigned i = 0; i != NumElems; ++i)
18287 ShuffleVec[i * SizeRatio] = i;
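// Illustrative: with SizeRatio == 2 the mask is {0, -1, 1, -1, ...},
// placing each loaded element in the low half of a widened lane and
// leaving the rest undef -- effectively an any-extend of the vector.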
18289 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18290 DAG.getUNDEF(WideVecVT), ShuffleVec);
18292 // Bitcast to the requested type.
18293 Shuff = DAG.getBitcast(RegVT, Shuff);
18294 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18295 return Shuff;
18296 }
18298 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18299 /// each of which has no other use apart from the AND / OR.
18300 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18301 Opc = Op.getOpcode();
18302 if (Opc != ISD::OR && Opc != ISD::AND)
18303 return false;
18304 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18305 Op.getOperand(0).hasOneUse() &&
18306 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18307 Op.getOperand(1).hasOneUse());
18308 }
18310 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
18311 /// SETCC node has a single use.
18312 static bool isXor1OfSetCC(SDValue Op) {
18313 if (Op.getOpcode() != ISD::XOR)
18314 return false;
18315 if (isOneConstant(Op.getOperand(1)))
18316 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18317 Op.getOperand(0).hasOneUse();
18318 return false;
18319 }
18321 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18322 bool addTest = true;
18323 SDValue Chain = Op.getOperand(0);
18324 SDValue Cond = Op.getOperand(1);
18325 SDValue Dest = Op.getOperand(2);
18326 SDLoc dl(Op);
18327 SDValue CC;
18328 bool Inverted = false;
18330 if (Cond.getOpcode() == ISD::SETCC) {
18331 // Check for setcc([su]{add,sub,mul}o == 0).
18332 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18333 isNullConstant(Cond.getOperand(1)) &&
18334 Cond.getOperand(0).getResNo() == 1 &&
18335 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18336 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18337 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18338 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18339 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18340 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18341 Inverted = true;
18342 Cond = Cond.getOperand(0);
18343 } else {
18344 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18345 Cond = NewCond;
18346 }
18347 }
18348 #if 0
18349 // FIXME: LowerXALUO doesn't handle these!!
18350 else if (Cond.getOpcode() == X86ISD::ADD ||
18351 Cond.getOpcode() == X86ISD::SUB ||
18352 Cond.getOpcode() == X86ISD::SMUL ||
18353 Cond.getOpcode() == X86ISD::UMUL)
18354 Cond = LowerXALUO(Cond, DAG);
18355 #endif
18357 // Look past (and (setcc_carry (cmp ...)), 1).
18358 if (Cond.getOpcode() == ISD::AND &&
18359 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18360 isOneConstant(Cond.getOperand(1)))
18361 Cond = Cond.getOperand(0);
18363 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18364 // setting operand in place of the X86ISD::SETCC.
18365 unsigned CondOpcode = Cond.getOpcode();
18366 if (CondOpcode == X86ISD::SETCC ||
18367 CondOpcode == X86ISD::SETCC_CARRY) {
18368 CC = Cond.getOperand(0);
18370 SDValue Cmp = Cond.getOperand(1);
18371 unsigned Opc = Cmp.getOpcode();
18372 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18373 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18374 Cond = Cmp;
18375 addTest = false;
18376 } else {
18377 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18378 default: break;
18379 case X86::COND_O:
18380 case X86::COND_B:
18381 // These can only come from an arithmetic instruction with overflow,
18382 // e.g. SADDO, UADDO.
18383 Cond = Cond.getOperand(1);
18384 addTest = false;
18385 break;
18386 }
18387 }
18388 }
18389 CondOpcode = Cond.getOpcode();
18390 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18391 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18392 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18393 Cond.getOperand(0).getValueType() != MVT::i8)) {
18394 SDValue LHS = Cond.getOperand(0);
18395 SDValue RHS = Cond.getOperand(1);
18396 unsigned X86Opcode;
18397 unsigned X86Cond;
18398 SDVTList VTs;
18399 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18400 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18401 // X86ISD::INC).
18402 switch (CondOpcode) {
18403 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18404 case ISD::SADDO:
18405 if (isOneConstant(RHS)) {
18406 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18407 break;
18408 }
18409 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18410 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18411 case ISD::SSUBO:
18412 if (isOneConstant(RHS)) {
18413 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18414 break;
18415 }
18416 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18417 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18418 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18419 default: llvm_unreachable("unexpected overflowing operator");
18422 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18423 if (CondOpcode == ISD::UMULO)
18424 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18427 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18429 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18431 if (CondOpcode == ISD::UMULO)
18432 Cond = X86Op.getValue(2);
18434 Cond = X86Op.getValue(1);
18436 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
  } else {
    unsigned CondOpc;
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, dl, MVT::i8);
          SDNode *User = *Op.getNode()->use_begin();
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User->getOpcode() == ISD::BR) {
            SDValue FalseBB = User->getOperand(1);
            SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User);
            (void)NewBR;
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, dl, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
      // It should be transformed during dag combiner, except when the
      // condition is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, dl, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
      // For FCMP_OEQ, we can emit
      // two branches instead of an explicit AND instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_OEQ.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;
          Dest = FalseBB;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
      // For FCMP_UNE, we can emit
      // two branches instead of an explicit OR instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_UNE.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
          Dest = FalseBB;
        }
      }
    }
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result is compared against zero. Try to match it to BT.
    if (Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    Cond = EmitTest(Cond, X86Cond, dl, DAG);
  }
  Cond = ConvertCmpIfNecessary(Cond, DAG);
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}
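// For example, with no fall-through edge, "br (fcmp oeq %a, %b), %T, %F" is
// lowered by the SETOEQ path above into one compare feeding two conditional
// branches:
//   ucomiss %b, %a
//   jne %F
//   jp  %F
//   jmp %T
// instead of materializing sete/setnp, ANDing them, and testing the result.
// (Illustrative sketch; the exact instructions depend on the subtarget.)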
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
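// For example, allocating 16K in one go on such targets emits a probe call
// (with the size in RAX/EAX) before the stack pointer is adjusted, so each
// new 4K page is touched in order. The probe symbol varies by environment
// (e.g. _alloca on cygwin/mingw32, ___chkstk_ms on mingw-w64, __chkstk for
// MSVC x64); this is a sketch of the intent, not the exact call sequence.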
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool SplitStack = MF.shouldSplitStack();
  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack;
  SDLoc dl(Op);

  // Get the inputs.
  SDNode *Node = Op.getNode();
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);
  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  EVT VT = Node->getValueType(0);

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

  bool Is64Bit = Subtarget.is64Bit();
  MVT SPTy = getPointerTy(DAG.getDataLayout());

  SDValue Result;
  if (!Lower) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
                    " not tell us which reg is the stack pointer!");

    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
    Chain = SP.getValue(1);
    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
    unsigned StackAlign = TFI.getStackAlignment();
    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
    if (Align > StackAlign)
      Result = DAG.getNode(ISD::AND, dl, VT, Result,
                           DAG.getConstant(-(uint64_t)Align, dl, VT));
    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
  } else if (SplitStack) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use it along with nested
      // parameters.
      const Function *F = MF.getFunction();
      for (const auto &A : F->args()) {
        if (A.hasNestAttr())
          report_fatal_error("Cannot use segmented stacks with functions that "
                             "have nested arguments.");
      }
    }

    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                         DAG.getRegister(Vreg, SPTy));
  } else {
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned SPReg = RegInfo->getStackRegister();
    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
    Chain = SP.getValue(1);

    if (Align) {
      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, dl, VT));
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
    }

    Result = SP;
  }

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

  SDValue Ops[2] = {Result, Chain};
  return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);

  if (!Subtarget.is64Bit() ||
      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }
  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (points to parameters coming in memory).
  //   reg_save_area
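  // For reference, this is the SysV x86-64 ABI's C-level view of the same
  // object (field offsets assume LP64):
  //   typedef struct {
  //     unsigned int gp_offset;    // byte offset 0
  //     unsigned int fp_offset;    // byte offset 4
  //     void *overflow_arg_area;   // byte offset 8
  //     void *reg_save_area;       // byte offset 16
  //   } __va_list_tag;
  // The four stores below initialize these fields in order.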
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV));
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
  Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV, 4));
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  Store =
      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
  Store = DAG.getStore(
      Op.getOperand(0), DL, RSFIN, FIN,
      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
  MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert(Op.getNumOperands() == 4);

  MachineFunction &MF = DAG.getMachineFunction();
  if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
    // The Win64 ABI uses char* instead of a structure.
    return DAG.expandVAArg(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  SDLoc dl(Op);

  EVT ArgVT = Op.getNode()->getValueType(0);
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
  uint8_t ArgMode;

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety. This simple
  // selection mechanism works only for the basic types.
  if (ArgVT == MVT::f80) {
    llvm_unreachable("va_arg for f80 not yet implemented");
  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
  } else {
    llvm_unreachable("Unhandled argument type in LowerVAARG");
  }

  if (ArgMode == 2) {
    // Sanity Check: Make sure using fp_offset makes sense.
    assert(!Subtarget.useSoftFloat() &&
           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
           Subtarget.hasSSE1());
  }

  // Insert VAARG_64 node into the DAG.
  // VAARG_64 returns two values: Variable Argument Address, Chain.
  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
                       DAG.getConstant(ArgMode, dl, MVT::i8),
                       DAG.getConstant(Align, dl, MVT::i32)};
  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
                                          VTs, InstOps, MVT::i64,
                                          MachinePointerInfo(SV),
                                          /*Align=*/0,
                                          /*Volatile=*/false,
                                          /*ReadMem=*/true,
                                          /*WriteMem=*/true);
  Chain = VAARG.getValue(1);

  // Load the next argument and return it.
  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
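// For example, "va_arg(ap, double)" reaches here with ArgSize = 8 and
// ArgMode = 2. The VAARG_64 pseudo is later expanded with a custom inserter
// (EmitVAARG64WithCustomInserter in this file) into, roughly: compare
// fp_offset against the end of the FP register-save area, then either take
// reg_save_area + fp_offset and bump fp_offset, or take overflow_arg_area and
// bump that pointer. (A sketch of the control flow, not the exact MI
// sequence.)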
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
  // where a va_list is still an i8*.
  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
  if (Subtarget.isCallingConvWin64(
          DAG.getMachineFunction().getFunction()->getCallingConv()))
    // Probably a Win64 va_copy.
    return DAG.expandVACopy(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);

  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
                       false, false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
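// The 24-byte copy length above is sizeof(__va_list_tag) on x86-64:
// 4 (gp_offset) + 4 (fp_offset) + 8 (overflow_arg_area) + 8 (reg_save_area).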
/// Handle vector element shifts where the shift amount is a constant.
/// Takes the immediate version of the shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                          SDValue SrcOp, uint64_t ShiftAmt,
                                          SelectionDAG &DAG) {
  MVT ElementType = VT.getVectorElementType();

  // Bitcast the source vector to the output type; this is mainly necessary
  // for vXi8/vXi64 shifts.
  if (VT != SrcOp.getSimpleValueType())
    SrcOp = DAG.getBitcast(VT, SrcOp);

  // Fold this packed shift into its first operand if ShiftAmt is 0.
  if (ShiftAmt == 0)
    return SrcOp;

  // Check for ShiftAmt >= element width.
  if (ShiftAmt >= ElementType.getSizeInBits()) {
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = ElementType.getSizeInBits() - 1;
    else
      return DAG.getConstant(0, dl, VT);
  }

  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
         && "Unknown target vector shift-by-constant node");

  // Fold this packed vector shift into a build vector if SrcOp is a
  // vector of Constants or UNDEFs.
  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
    SmallVector<SDValue, 8> Elts;
    unsigned NumElts = SrcOp->getNumOperands();
    ConstantSDNode *ND;

    switch (Opc) {
    default: llvm_unreachable("Unknown opcode!");
    case X86ISD::VSHLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRAI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
      }
      break;
    }

    return DAG.getBuildVector(VT, dl, Elts);
  }

  return DAG.getNode(Opc, dl, VT, SrcOp,
                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
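// Example: a VSRAI of v4i32 by 35 is clamped above to a shift by 31, since
// arithmetic shifts saturate toward the sign bit, while VSHLI/VSRLI by an
// out-of-range amount fold directly to an all-zeros vector.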
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes the immediate version of the shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                   SDValue SrcOp, SDValue ShAmt,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT SVT = ShAmt.getSimpleValueType();
  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

  // Catch shift-by-constant.
  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
                                      CShAmt->getZExtValue(), DAG);

  // Change opcode to non-immediate version.
  switch (Opc) {
  default: llvm_unreachable("Unknown target vector shift node");
  case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
  case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
  case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
  }

  // Need to build a vector containing the shift amount.
  // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
  // +=================+============+=======================================+
  // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
  // +=================+============+=======================================+
  // | i64             | Yes, No    | Use ShAmt as lowest elt               |
  // | i32             | Yes        | zero-extend in-reg                    |
  // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
  // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
  // +=================+============+=======================================+
  if (SVT == MVT::i64)
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
           ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
    ShAmt = ShAmt.getOperand(0);
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else if (Subtarget.hasSSE41() &&
             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else {
    SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
                                     DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
  }

  // The return type has to be a 128-bit type with the same element
  // type as the input type.
  MVT EltVT = VT.getVectorElementType();
  MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());

  ShAmt = DAG.getBitcast(ShVT, ShAmt);
  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
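// Design note: the scalar shift count is widened to a full XMM operand because
// the hardware variable-shift forms read the entire low 64 bits of the count
// register, so the bits above the scalar amount must be known zero -- hence
// the in-reg zero-extends on SSE4.1 and the explicit {ShAmt, 0, undef, undef}
// build_vector otherwise.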
/// \brief Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl) {
  if (isAllOnesConstant(Mask))
    return DAG.getTargetConstant(1, dl, MaskVT);
  if (X86::isZeroNode(Mask))
    return DAG.getTargetConstant(0, dl, MaskVT);

  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
    // Mask should be extended.
    Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
  }

  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
    if (MaskVT == MVT::v64i1) {
      assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
      // In 32-bit mode a bitcast of i64 is illegal; extend/split it.
      SDValue Lo, Hi;
      Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                       DAG.getConstant(0, dl, MVT::i32));
      Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                       DAG.getConstant(1, dl, MVT::i32));

      Lo = DAG.getBitcast(MVT::v32i1, Lo);
      Hi = DAG.getBitcast(MVT::v32i1, Hi);

      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
    } else {
      // MaskVT requires fewer than 64 bits. Truncate the mask (this should
      // always succeed), and bitcast.
      MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
      return DAG.getBitcast(MaskVT,
                            DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
    }
  }

  MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                   Mask.getSimpleValueType().getSizeInBits());
  // In case MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
  // are extracted by EXTRACT_SUBVECTOR.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                     DAG.getBitcast(BitcastVT, Mask),
                     DAG.getIntPtrConstant(0, dl));
}
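// Example: a v4i1 MaskVT whose mask arrives as i8 takes the last path above --
// the i8 is bitcast to v8i1 and the low 4 lanes are taken with
// EXTRACT_SUBVECTOR.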
/// \brief Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  unsigned OpcodeSelect = ISD::VSELECT;
  SDLoc dl(Op);

  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  switch (Op.getOpcode()) {
  default: break;
  case X86ISD::PCMPEQM:
  case X86ISD::PCMPGTM:
  case X86ISD::CMPM:
  case X86ISD::CMPMU:
    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
  case X86ISD::VFPCLASS:
  case X86ISD::VFPCLASSS:
    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
  case X86ISD::VTRUNC:
  case X86ISD::VTRUNCS:
  case X86ISD::VTRUNCUS:
  case X86ISD::CVTPS2PH:
    // We can't use ISD::VSELECT here because it is not always "Legal" for the
    // destination type. For example, vpmovqb requires only AVX512F, but a
    // vselect that operates on byte elements requires AVX512BW.
    OpcodeSelect = X86ISD::SELECT;
    break;
  }

  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
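// Example: a masked arithmetic intrinsic becomes
//   (vselect Mask, (op Src1, Src2), PreservedSrc)
// whereas compare-to-mask nodes already produce a k-mask result, so masking
// reduces to an AND with the caller's mask.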
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
    if (MaskConst->getZExtValue() & 0x1)
      return Op;

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
  if (Op.getOpcode() == X86ISD::FSETCCM ||
      Op.getOpcode() == X86ISD::FSETCCM_RND)
    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
  if (Op.getOpcode() == X86ISD::VFPCLASSS)
    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);

  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
  if (!Fn->hasPersonalityFn())
    report_fatal_error(
        "querying registration node size for function without personality");
  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
  // WinEHStatePass for the full struct definition.
  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
  case EHPersonality::MSVC_X86SEH: return 24;
  case EHPersonality::MSVC_CXX: return 16;
  default: break;
  }
  report_fatal_error(
      "can only recover FP for 32-bit MSVC EH personality functions");
}
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
///   RegNodeBase = EntryEBP - RegNodeSize
///   ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
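/// Worked example (illustrative numbers only): with RegNodeSize = 24 (an SEH
/// personality) and a recorded ParentFrameOffset of -16,
///   ParentFP = (EntryEBP - 24) - (-16) = EntryEBP - 8.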
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
                                   SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  // It's possible that the parent function no longer has a personality function
  // if the exceptional code was optimized away, in which case we just return
  // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;

  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
  // registration, or the .set_setframe offset.
  MCSymbol *OffsetSym =
      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
  SDValue ParentFrameOffset =
      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
  // prologue to RBP in the parent function.
  const X86Subtarget &Subtarget =
      static_cast<const X86Subtarget &>(DAG.getSubtarget());
  if (Subtarget.is64Bit())
    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
  // RegNodeBase = EntryEBP - RegNodeSize
  // ParentFP = RegNodeBase - ParentFrameOffset
  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  // Helper to detect if the operand is the CUR_DIRECTION rounding mode.
  auto isRoundModeCurDirection = [](SDValue Rnd) {
    if (!isa<ConstantSDNode>(Rnd))
      return false;

    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
  };

  SDLoc dl(Op);
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  MVT VT = Op.getSimpleValueType();
  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
  if (IntrData) {
    switch(IntrData->Type) {
    case INTR_TYPE_1OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
    case INTR_TYPE_2OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    case INTR_TYPE_3OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3));
    case INTR_TYPE_4OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
    case INTR_TYPE_1OP_MASK_RM: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue RoundingMode;
      // We always add the rounding mode to the node.
      // If the rounding mode is not specified, we add the
      // "current direction" mode.
      if (Op.getNumOperands() == 4)
        RoundingMode =
          DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      else
        RoundingMode = Op.getOperand(4);
      assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              RoundingMode),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_1OP_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add the rounding mode to the node when
      // - the RM opcode is specified and
      // - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue passThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, VT, Src1, Src2, Rnd),
                                      Mask, passThru, Subtarget, DAG);
      }
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, passThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src0 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // There are 2 kinds of intrinsics in this group:
      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
      // (2) With rounding mode and sae - 7 operands.
      if (Op.getNumOperands() == 6) {
        SDValue Sae = Op.getOperand(5);
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                                Sae),
                                    Mask, Src0, Subtarget, DAG);
      }
      assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
      SDValue RoundingMode = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                              RoundingMode, Sae),
                                  Mask, Src0, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK:
    case INTR_TYPE_2OP_IMM8_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
        Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      // TODO: Intrinsics should have fast-math-flags to propagate.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // We specify 2 possible modes for intrinsics: with and without a
      // rounding mode.
      // First, we check if the intrinsic has a rounding mode (6 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 6)
        Rnd = Op.getOperand(5);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);

      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                              Src2, Src3, Sae),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Imm = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      // We specify 2 possible modes for intrinsics: with and without a
      // rounding mode.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Imm, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_IMM8_MASK:
    case INTR_TYPE_3OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);

      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(6);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_2OP_MASK : {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      // Swap Src1 and Src2 in the node creation.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_3OP_MASKZ:
    case VPERM_3OP_MASK: {
      MVT VT = Op.getSimpleValueType();
      // Src2 is the PassThru.
      SDValue Src1 = Op.getOperand(1);
      // PassThru needs to be the same type as the destination in order
      // to pattern match correctly.
      SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == VPERM_3OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else
        PassThru = Src2;

      // Swap Src1 and Src2 in the node creation.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src2, Src1, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_MASK3:
    case FMA_OP_MASKZ:
    case FMA_OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_SCALAR_MASK:
    case FMA_OP_SCALAR_MASK3:
    case FMA_OP_SCALAR_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      SDValue Rnd = Op.getOperand(5);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                              Op.getValueType(), Src1, Src2,
                                              Src3, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case TERLOG_OP_MASK:
    case TERLOG_OP_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
      SDValue Mask = Op.getOperand(5);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = Src1;
      // Set the PassThru element.
      if (IntrData->Type == TERLOG_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3, Src4),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CVTPD2PS:
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
                         DAG.getIntPtrConstant(0, dl));
    case CVTPD2PS_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add the rounding mode to the node when
      // - the RM opcode is specified and
      // - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              DAG.getIntPtrConstant(0, dl)),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FPCLASS: {
      // FPclass intrinsics with mask.
      SDValue Src1 = Op.getOperand(1);
      MVT VT = Src1.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
      SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MaskVT),
                                                 Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), FPclassMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case FPCLASSS: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MVT::i1),
                                                 Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case CMP_MASK:
    case CMP_MASK_CC: {
      // Comparison intrinsics with masks.
      // Example of transformation:
      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
      // (i8 (bitcast
      //   (v8i1 (insert_subvector undef,
      //           (v2i1 (and (PCMPEQM %a, %b),
      //                      (extract_subvector
      //                         (v8i1 (bitcast %mask)), 0))), 0))))
      MVT VT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue Cmp;
      if (IntrData->Type == CMP_MASK_CC) {
        SDValue CC = Op.getOperand(3);
        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have a non-default rounding
        // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
        if (IntrData->Opc1 != 0) {
          SDValue Rnd = Op.getOperand(5);
          if (!isRoundModeCurDirection(Rnd))
            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
                              Op.getOperand(2), CC, Rnd);
        }
        // Default rounding mode.
        if (!Cmp.getNode())
          Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC);
      } else {
        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                          Op.getOperand(2));
      }
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MaskVT),
                                             Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CmpMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case CMP_MASK_SCALAR_CC: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
      SDValue Mask = Op.getOperand(4);

      SDValue Cmp;
      if (IntrData->Opc1 != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
      }
      // Default rounding mode.
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MVT::i1),
                                             Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case COMI: { // Comparison intrinsics
      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: { // (ZF = 1 and PF = 0)
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }
      case ISD::SETNE: { // (ZF = 0 or PF = 1)
        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
        break;
      }
      case ISD::SETGT: // (CF = 0 and ZF = 0)
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
        break;
      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
        break;
      }
      case ISD::SETGE: // CF = 0
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
        break;
      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
        break;
      default:
        llvm_unreachable("Unexpected illegal condition!");
      }
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
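    // Example for the COMI case above: _mm_comieq_sd(a, b) emits COMISD and
    // then ANDs SETE with SETNP, because an unordered compare also sets ZF
    // and only PF distinguishes the unordered case from true equality.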
    case COMI_RM: { // Comparison intrinsics with Sae.
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      SDValue Sae = Op.getOperand(4);

      SDValue FCmp;
      if (isRoundModeCurDirection(Sae))
        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8));
      else
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8), Sae);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
                         DAG.getIntPtrConstant(0, dl));
    }
    case VSHIFT:
      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                 Op.getOperand(1), Op.getOperand(2), Subtarget,
                                 DAG);
    case COMPRESS_EXPAND_IN_REG: {
      SDValue Mask = Op.getOperand(3);
      SDValue DataToCompress = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      if (isAllOnesConstant(Mask)) // Return the data as is.
        return Op.getOperand(1);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              DataToCompress),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CONVERT_MASK_TO_VEC: {
      SDValue Mask = Op.getOperand(1);
      MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                    Mask.getSimpleValueType().getSizeInBits());
      Mask = DAG.getBitcast(MaskVT, Mask);
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
    }
    case KUNPCK: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      // Arguments should be swapped.
      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
                                Src2, Src1);
      return DAG.getBitcast(VT, Res);
    }
    case MASK_BINOP: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
      return DAG.getBitcast(VT, Res);
    }
    case FIXUPIMMS:
    case FIXUPIMMS_MASKZ:
    case FIXUPIMM:
    case FIXUPIMM_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Imm = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ?
                         Src1 : getZeroVector(VT, Subtarget, DAG, dl);
      // We specify 2 possible modes for intrinsics: with and without a
      // rounding mode.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
      else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
    }
    case CONVERT_TO_MASK: {
      MVT SrcVT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
      MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
                                    Op.getOperand(1));
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CvtMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case BRCST_SUBVEC_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue Passthru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      EVT resVT = Passthru.getValueType();
      SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
                                   DAG.getUNDEF(resVT), Src,
                                   DAG.getIntPtrConstant(0, dl));
      SDValue immVal;
      if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
        immVal = DAG.getConstant(0x44, dl, MVT::i8);
      else
        immVal = DAG.getConstant(0, dl, MVT::i8);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              subVec, subVec, immVal),
                                  Mask, Passthru, Subtarget, DAG);
    }
    case BRCST32x2_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);

      assert((VT.getScalarType() == MVT::i32 ||
              VT.getScalarType() == MVT::f32) && "Unexpected type!");
      // Bitcast Src to packed 64-bit elements.
      MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
      MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
      Src = DAG.getBitcast(BitcastVT, Src);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    default:
      break;
    }
  }

  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
    // Operands intentionally swapped. Mask is last operand to intrinsic,
    // but second operand for node/instruction.
    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(1));
  // ptest and testp intrinsics. The intrinsics these come from are designed
  // to return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
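  // Example for the lowering above: int _mm_testz_si128(a, b) becomes
  // PTEST a, b followed by SETE, and the i8 flag result is zero-extended to
  // the i32 the intrinsic returns.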
  case Intrinsic::x86_avx512_kortestz_w:
  case Intrinsic::x86_avx512_kortestc_w: {
    X86::CondCode X86CC =
      (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  case Intrinsic::x86_avx512_knot_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kandn_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    // Invert LHS for the not.
    LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
                      DAG.getConstant(1, dl, MVT::v16i1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kxnor_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    // Invert the result for the not.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
                      DAG.getConstant(1, dl, MVT::v16i1));
    return DAG.getBitcast(MVT::i16, Res);
  }
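  // Note on the three k-register ops above: they are expressed as generic
  // v16i1 logic (with an XOR against all-ones for the NOT) and bitcast back
  // to i16 so ordinary DAG combines apply; instruction selection is then
  // expected to pick the k-instructions (e.g. KNOTW, KANDNW, KXNORW). This is
  // a reading of the code above, not a guarantee about the final MI sequence.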
19879 case Intrinsic::x86_sse42_pcmpistria128:
19880 case Intrinsic::x86_sse42_pcmpestria128:
19881 case Intrinsic::x86_sse42_pcmpistric128:
19882 case Intrinsic::x86_sse42_pcmpestric128:
19883 case Intrinsic::x86_sse42_pcmpistrio128:
19884 case Intrinsic::x86_sse42_pcmpestrio128:
19885 case Intrinsic::x86_sse42_pcmpistris128:
19886 case Intrinsic::x86_sse42_pcmpestris128:
19887 case Intrinsic::x86_sse42_pcmpistriz128:
19888 case Intrinsic::x86_sse42_pcmpestriz128: {
19890 X86::CondCode X86CC;
19892 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
19893 case Intrinsic::x86_sse42_pcmpistria128:
19894 Opcode = X86ISD::PCMPISTRI;
19895 X86CC = X86::COND_A;
19897 case Intrinsic::x86_sse42_pcmpestria128:
19898 Opcode = X86ISD::PCMPESTRI;
19899 X86CC = X86::COND_A;
19901 case Intrinsic::x86_sse42_pcmpistric128:
19902 Opcode = X86ISD::PCMPISTRI;
19903 X86CC = X86::COND_B;
19905 case Intrinsic::x86_sse42_pcmpestric128:
19906 Opcode = X86ISD::PCMPESTRI;
19907 X86CC = X86::COND_B;
19909 case Intrinsic::x86_sse42_pcmpistrio128:
19910 Opcode = X86ISD::PCMPISTRI;
19911 X86CC = X86::COND_O;
19913 case Intrinsic::x86_sse42_pcmpestrio128:
19914 Opcode = X86ISD::PCMPESTRI;
19915 X86CC = X86::COND_O;
19917 case Intrinsic::x86_sse42_pcmpistris128:
19918 Opcode = X86ISD::PCMPISTRI;
19919 X86CC = X86::COND_S;
19921 case Intrinsic::x86_sse42_pcmpestris128:
19922 Opcode = X86ISD::PCMPESTRI;
19923 X86CC = X86::COND_S;
19925 case Intrinsic::x86_sse42_pcmpistriz128:
19926 Opcode = X86ISD::PCMPISTRI;
19927 X86CC = X86::COND_E;
19929 case Intrinsic::x86_sse42_pcmpestriz128:
19930 Opcode = X86ISD::PCMPESTRI;
19931 X86CC = X86::COND_E;
19934 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19935 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19936 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19937 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
19938 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19939 }
19941 case Intrinsic::x86_sse42_pcmpistri128:
19942 case Intrinsic::x86_sse42_pcmpestri128: {
19943 unsigned Opcode;
19944 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
19945 Opcode = X86ISD::PCMPISTRI;
19946 else
19947 Opcode = X86ISD::PCMPESTRI;
19949 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19950 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19951 return DAG.getNode(Opcode, dl, VTs, NewOps);
19952 }
19954 case Intrinsic::eh_sjlj_lsda: {
19955 MachineFunction &MF = DAG.getMachineFunction();
19956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19957 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19958 auto &Context = MF.getMMI().getContext();
19959 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
19960 Twine(MF.getFunctionNumber()));
19961 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
19962 }
19964 case Intrinsic::x86_seh_lsda: {
19965 // Compute the symbol for the LSDA. We know it'll get emitted later.
19966 MachineFunction &MF = DAG.getMachineFunction();
19967 SDValue Op1 = Op.getOperand(1);
19968 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
19969 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
19970 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19972 // Generate a simple absolute symbol reference. This intrinsic is only
19973 // supported on 32-bit Windows, which isn't PIC.
19974 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
19975 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
19976 }
19978 case Intrinsic::x86_seh_recoverfp: {
19979 SDValue FnOp = Op.getOperand(1);
19980 SDValue IncomingFPOp = Op.getOperand(2);
19981 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
19982 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
19983 if (!Fn)
19984 report_fatal_error(
19985 "llvm.x86.seh.recoverfp must take a function as the first argument");
19986 return recoverFramePointer(DAG, Fn, IncomingFPOp);
19987 }
19989 case Intrinsic::localaddress: {
19990 // Returns one of the stack, base, or frame pointer registers, depending on
19991 // which is used to reference local variables.
19992 MachineFunction &MF = DAG.getMachineFunction();
19993 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19994 unsigned Reg;
19995 if (RegInfo->hasBasePointer(MF))
19996 Reg = RegInfo->getBaseRegister();
19997 else // This function handles the SP or FP case.
19998 Reg = RegInfo->getPtrSizedFrameRegister(MF);
19999 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
20000 }
20001 }
20002 }
20004 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20005 SDValue Src, SDValue Mask, SDValue Base,
20006 SDValue Index, SDValue ScaleOp, SDValue Chain,
20007 const X86Subtarget &Subtarget) {
20008 SDLoc dl(Op);
20009 auto *C = cast<ConstantSDNode>(ScaleOp);
20010 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20011 EVT MaskVT = Mask.getValueType();
20012 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20013 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20014 SDValue Segment = DAG.getRegister(0, MVT::i32);
20015 // If source is undef or we know it won't be used, use a zero vector
20016 // to break register dependency.
20017 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20018 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20019 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20020 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20021 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20022 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20023 return DAG.getMergeValues(RetOps, dl);
20024 }
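// The operand order used above (Src, Base, Scale, Index, Disp, Segment, Mask,
// Chain) follows the usual X86 memory-operand layout, base + scale * index +
// disp with an optional segment, with the gather destination, mask and chain
// appended. Result 0 of the machine node is the gathered value and result 2
// is the output chain; getMergeValues pairs those two back up for the caller.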
20026 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20027 SDValue Src, SDValue Mask, SDValue Base,
20028 SDValue Index, SDValue ScaleOp, SDValue Chain,
20029 const X86Subtarget &Subtarget) {
20030 SDLoc dl(Op);
20031 auto *C = cast<ConstantSDNode>(ScaleOp);
20032 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20033 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20034 Index.getSimpleValueType().getVectorNumElements());
20036 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20037 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20038 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20039 SDValue Segment = DAG.getRegister(0, MVT::i32);
20040 // If source is undef or we know it won't be used, use a zero vector
20041 // to break register dependency.
20042 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20043 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20044 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20045 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20046 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20047 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20048 return DAG.getMergeValues(RetOps, dl);
20049 }
20051 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20052 SDValue Src, SDValue Mask, SDValue Base,
20053 SDValue Index, SDValue ScaleOp, SDValue Chain,
20054 const X86Subtarget &Subtarget) {
20055 SDLoc dl(Op);
20056 auto *C = cast<ConstantSDNode>(ScaleOp);
20057 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20058 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20059 SDValue Segment = DAG.getRegister(0, MVT::i32);
20060 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20061 Index.getSimpleValueType().getVectorNumElements());
20063 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20064 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20065 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20066 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20067 return SDValue(Res, 1);
20068 }
20070 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20071 SDValue Mask, SDValue Base, SDValue Index,
20072 SDValue ScaleOp, SDValue Chain,
20073 const X86Subtarget &Subtarget) {
20074 SDLoc dl(Op);
20075 auto *C = cast<ConstantSDNode>(ScaleOp);
20076 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20077 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20078 SDValue Segment = DAG.getRegister(0, MVT::i32);
20079 MVT MaskVT =
20080 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20081 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20082 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20083 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20084 return SDValue(Res, 0);
20085 }
20087 /// Handles the lowering of builtin intrinsics that return the value
20088 /// of the extended control register.
20089 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20090 SelectionDAG &DAG,
20091 const X86Subtarget &Subtarget,
20092 SmallVectorImpl<SDValue> &Results) {
20093 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20094 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20095 SDValue LO, HI;
20097 // The ECX register is used to select the index of the XCR register to
20098 // return.
20099 SDValue Chain =
20100 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20101 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20102 Chain = SDValue(N1, 0);
20104 // Reads the content of XCR and returns it in registers EDX:EAX.
20105 if (Subtarget.is64Bit()) {
20106 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20107 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20108 LO.getValue(2));
20109 } else {
20110 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20111 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20112 LO.getValue(2));
20113 }
20114 Chain = HI.getValue(1);
20116 if (Subtarget.is64Bit()) {
20117 // Merge the two 32-bit values into a 64-bit one.
20118 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20119 DAG.getConstant(32, DL, MVT::i8));
20120 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20121 Results.push_back(Chain);
20122 return;
20123 }
20125 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20126 SDValue Ops[] = { LO, HI };
20127 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20128 Results.push_back(Pair);
20129 Results.push_back(Chain);
20130 }
20132 /// Handles the lowering of builtin intrinsics that read performance monitor
20133 /// counters (x86_rdpmc).
20134 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20135 SelectionDAG &DAG,
20136 const X86Subtarget &Subtarget,
20137 SmallVectorImpl<SDValue> &Results) {
20138 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20139 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20140 SDValue LO, HI;
20142 // The ECX register is used to select the index of the performance counter
20143 // to read.
20144 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20145 N->getOperand(2));
20146 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20148 // Reads the content of a 64-bit performance counter and returns it in the
20149 // registers EDX:EAX.
20150 if (Subtarget.is64Bit()) {
20151 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20152 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20153 LO.getValue(2));
20154 } else {
20155 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20156 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20157 LO.getValue(2));
20158 }
20159 Chain = HI.getValue(1);
20161 if (Subtarget.is64Bit()) {
20162 // The EAX register is loaded with the low-order 32 bits. The EDX register
20163 // is loaded with the supported high-order bits of the counter.
20164 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20165 DAG.getConstant(32, DL, MVT::i8));
20166 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20167 Results.push_back(Chain);
20168 return;
20169 }
20171 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20172 SDValue Ops[] = { LO, HI };
20173 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20174 Results.push_back(Pair);
20175 Results.push_back(Chain);
20176 }
20178 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20179 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20180 /// READCYCLECOUNTER nodes.
20181 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20182 SelectionDAG &DAG,
20183 const X86Subtarget &Subtarget,
20184 SmallVectorImpl<SDValue> &Results) {
20185 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20186 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20187 SDValue LO, HI;
20189 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20190 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20191 // and the EAX register is loaded with the low-order 32 bits.
20192 if (Subtarget.is64Bit()) {
20193 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20194 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20195 LO.getValue(2));
20196 } else {
20197 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20198 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20199 LO.getValue(2));
20200 }
20201 SDValue Chain = HI.getValue(1);
20203 if (Opcode == X86ISD::RDTSCP_DAG) {
20204 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20206 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20207 // the ECX register. Add 'ecx' explicitly to the chain.
20208 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20209 HI.getValue(2));
20210 // Explicitly store the content of ECX at the location passed in input
20211 // to the 'rdtscp' intrinsic.
20212 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20213 MachinePointerInfo());
20214 }
20216 if (Subtarget.is64Bit()) {
20217 // The EDX register is loaded with the high-order 32 bits of the MSR, and
20218 // the EAX register is loaded with the low-order 32 bits.
20219 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20220 DAG.getConstant(32, DL, MVT::i8));
20221 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20222 Results.push_back(Chain);
20223 return;
20224 }
20226 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20227 SDValue Ops[] = { LO, HI };
20228 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20229 Results.push_back(Pair);
20230 Results.push_back(Chain);
20231 }
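// For a plain 64-bit RDTSC the code above amounts to roughly this sequence
// (a sketch of the eventual machine code, not literal output):
//   rdtsc                  ; counter -> EDX:EAX
//   shlq $32, %rdx
//   orq  %rdx, %rax        ; full 64-bit counter in RAX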
20233 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20234 SelectionDAG &DAG) {
20235 SmallVector<SDValue, 2> Results;
20236 SDLoc DL(Op);
20237 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20238 Results);
20239 return DAG.getMergeValues(Results, DL);
20240 }
20242 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20243 MachineFunction &MF = DAG.getMachineFunction();
20244 SDValue Chain = Op.getOperand(0);
20245 SDValue RegNode = Op.getOperand(2);
20246 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20247 if (!EHInfo)
20248 report_fatal_error("EH registrations only live in functions using WinEH");
20250 // Cast the operand to an alloca, and remember the frame index.
20251 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20252 if (!FINode)
20253 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20254 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20256 // Return the chain operand without making any DAG nodes.
20257 return Chain;
20258 }
20260 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20261 MachineFunction &MF = DAG.getMachineFunction();
20262 SDValue Chain = Op.getOperand(0);
20263 SDValue EHGuard = Op.getOperand(2);
20264 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20265 if (!EHInfo)
20266 report_fatal_error("EHGuard is only live in functions using WinEH");
20268 // Cast the operand to an alloca, and remember the frame index.
20269 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20270 if (!FINode)
20271 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20272 EHInfo->EHGuardFrameIndex = FINode->getIndex();
20274 // Return the chain operand without making any DAG nodes.
20275 return Chain;
20276 }
20278 /// Emit Truncating Store with signed or unsigned saturation.
20279 static SDValue
20280 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20281 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20282 SelectionDAG &DAG) {
20284 SDVTList VTs = DAG.getVTList(MVT::Other);
20285 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20286 SDValue Ops[] = { Chain, Val, Ptr, Undef };
20287 return SignedSat ?
20288 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20289 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20290 }
20292 /// Emit Masked Truncating Store with signed or unsigned saturation.
20293 static SDValue
20294 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20295 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20296 MachineMemOperand *MMO, SelectionDAG &DAG) {
20298 SDVTList VTs = DAG.getVTList(MVT::Other);
20299 SDValue Ops[] = { Chain, Ptr, Mask, Val };
20300 return SignedSat ?
20301 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20302 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20303 }
20305 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20306 SelectionDAG &DAG) {
20307 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20309 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
20310 if (!IntrData) {
20311 switch (IntNo) {
20312 case llvm::Intrinsic::x86_seh_ehregnode:
20313 return MarkEHRegistrationNode(Op, DAG);
20314 case llvm::Intrinsic::x86_seh_ehguard:
20315 return MarkEHGuard(Op, DAG);
20316 case llvm::Intrinsic::x86_flags_read_u32:
20317 case llvm::Intrinsic::x86_flags_read_u64:
20318 case llvm::Intrinsic::x86_flags_write_u32:
20319 case llvm::Intrinsic::x86_flags_write_u64: {
20320 // We need a frame pointer because this will get lowered to a PUSH/POP
20321 // sequence.
20322 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20323 MFI.setHasCopyImplyingStackAdjustment(true);
20324 // Don't do anything here, we will expand these intrinsics out later
20325 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
20326 return SDValue();
20327 }
20328 case Intrinsic::x86_lwpins32:
20329 case Intrinsic::x86_lwpins64: {
20330 SDLoc dl(Op);
20331 SDValue Chain = Op->getOperand(0);
20332 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
20333 SDValue LwpIns =
20334 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
20335 Op->getOperand(3), Op->getOperand(4));
20336 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
20337 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
20338 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
20339 LwpIns.getValue(1));
20340 }
20341 }
20342 return SDValue();
20343 }
20345 SDLoc dl(Op);
20346 switch(IntrData->Type) {
20347 default: llvm_unreachable("Unknown Intrinsic Type");
20348 case RDSEED:
20349 case RDRAND: {
20350 // Emit the node with the right value type.
20351 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20352 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20354 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20355 // Otherwise return the value from Rand, which is always 0, casted to i32.
20356 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20357 DAG.getConstant(1, dl, Op->getValueType(1)),
20358 DAG.getConstant(X86::COND_B, dl, MVT::i32),
20359 SDValue(Result.getNode(), 1) };
20360 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
20361 DAG.getVTList(Op->getValueType(1), MVT::Glue),
20362 Ops);
20364 // Return { result, isValid, chain }.
20365 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20366 SDValue(Result.getNode(), 2));
20367 }
20368 case GATHER_AVX2: {
20369 SDValue Chain = Op.getOperand(0);
20370 SDValue Src = Op.getOperand(2);
20371 SDValue Base = Op.getOperand(3);
20372 SDValue Index = Op.getOperand(4);
20373 SDValue Mask = Op.getOperand(5);
20374 SDValue Scale = Op.getOperand(6);
20375 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20376 Scale, Chain, Subtarget);
20377 }
20378 case GATHER: {
20379 // gather(v1, mask, index, base, scale);
20380 SDValue Chain = Op.getOperand(0);
20381 SDValue Src = Op.getOperand(2);
20382 SDValue Base = Op.getOperand(3);
20383 SDValue Index = Op.getOperand(4);
20384 SDValue Mask = Op.getOperand(5);
20385 SDValue Scale = Op.getOperand(6);
20386 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
20387 Chain, Subtarget);
20388 }
20389 case SCATTER: {
20390 // scatter(base, mask, index, v1, scale);
20391 SDValue Chain = Op.getOperand(0);
20392 SDValue Base = Op.getOperand(2);
20393 SDValue Mask = Op.getOperand(3);
20394 SDValue Index = Op.getOperand(4);
20395 SDValue Src = Op.getOperand(5);
20396 SDValue Scale = Op.getOperand(6);
20397 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20398 Scale, Chain, Subtarget);
20399 }
20400 case PREFETCH: {
20401 SDValue Hint = Op.getOperand(6);
20402 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20403 assert((HintVal == 2 || HintVal == 3) &&
20404 "Wrong prefetch hint in intrinsic: should be 2 or 3");
20405 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20406 SDValue Chain = Op.getOperand(0);
20407 SDValue Mask = Op.getOperand(2);
20408 SDValue Index = Op.getOperand(3);
20409 SDValue Base = Op.getOperand(4);
20410 SDValue Scale = Op.getOperand(5);
20411 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
20412 Subtarget);
20413 }
20414 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
20415 case RDTSC: {
20416 SmallVector<SDValue, 2> Results;
20417 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20418 Results);
20419 return DAG.getMergeValues(Results, dl);
20420 }
20421 // Read Performance Monitoring Counters.
20422 case RDPMC: {
20423 SmallVector<SDValue, 2> Results;
20424 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20425 return DAG.getMergeValues(Results, dl);
20426 }
20427 // Get Extended Control Register.
20428 case XGETBV: {
20429 SmallVector<SDValue, 2> Results;
20430 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20431 return DAG.getMergeValues(Results, dl);
20432 }
20433 // XTEST intrinsics.
20434 case XTEST: {
20435 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20436 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20438 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20439 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20440 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20441 Ret, SDValue(InTrans.getNode(), 1));
20442 }
20443 // ADC/ADCX/SBB
20444 case ADX: {
20445 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20446 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
20447 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20448 DAG.getConstant(-1, dl, MVT::i8));
20449 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20450 Op.getOperand(4), GenCF.getValue(1));
20451 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20452 Op.getOperand(5), MachinePointerInfo());
20453 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20454 SDValue Results[] = { SetCC, Store };
20455 return DAG.getMergeValues(Results, dl);
20456 }
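// The X86ISD::ADD with constant -1 above rebuilds CF from the incoming i8
// carry operand: carry + 0xFF overflows (and so sets CF) exactly when the
// carry operand is nonzero, e.g. 1 + 0xFF = 0x100 sets CF while 0 + 0xFF =
// 0xFF does not. The ADC/SBB node then consumes that flag via GenCF.getValue(1).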
20457 case COMPRESS_TO_MEM: {
20458 SDValue Mask = Op.getOperand(4);
20459 SDValue DataToCompress = Op.getOperand(3);
20460 SDValue Addr = Op.getOperand(2);
20461 SDValue Chain = Op.getOperand(0);
20462 MVT VT = DataToCompress.getSimpleValueType();
20464 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20465 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20467 if (isAllOnesConstant(Mask)) // return just a store
20468 return DAG.getStore(Chain, dl, DataToCompress, Addr,
20469 MemIntr->getMemOperand());
20471 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20472 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20474 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20475 MemIntr->getMemOperand(),
20476 false /* truncating */, true /* compressing */);
20477 }
20478 case TRUNCATE_TO_MEM_VI8:
20479 case TRUNCATE_TO_MEM_VI16:
20480 case TRUNCATE_TO_MEM_VI32: {
20481 SDValue Mask = Op.getOperand(4);
20482 SDValue DataToTruncate = Op.getOperand(3);
20483 SDValue Addr = Op.getOperand(2);
20484 SDValue Chain = Op.getOperand(0);
20486 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20487 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20489 EVT MemVT = MemIntr->getMemoryVT();
20491 uint16_t TruncationOp = IntrData->Opc0;
20492 switch (TruncationOp) {
20493 case X86ISD::VTRUNC: {
20494 if (isAllOnesConstant(Mask)) // return just a truncate store
20495 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20496 MemIntr->getMemOperand());
20498 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20499 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20501 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20502 MemIntr->getMemOperand(), true /* truncating */);
20503 }
20504 case X86ISD::VTRUNCUS:
20505 case X86ISD::VTRUNCS: {
20506 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20507 if (isAllOnesConstant(Mask))
20508 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20509 MemIntr->getMemOperand(), DAG);
20511 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20512 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20514 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20515 VMask, MemVT, MemIntr->getMemOperand(), DAG);
20516 }
20517 default:
20518 llvm_unreachable("Unsupported truncstore intrinsic");
20519 }
20520 }
20522 case EXPAND_FROM_MEM: {
20523 SDValue Mask = Op.getOperand(4);
20524 SDValue PassThru = Op.getOperand(3);
20525 SDValue Addr = Op.getOperand(2);
20526 SDValue Chain = Op.getOperand(0);
20527 MVT VT = Op.getSimpleValueType();
20529 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20530 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20532 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20533 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20534 if (X86::isZeroNode(Mask))
20535 return DAG.getUNDEF(VT);
20537 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20538 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20539 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20540 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20541 true /* expanding */);
20542 }
20543 }
20544 }
20546 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20547 SelectionDAG &DAG) const {
20548 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20549 MFI.setReturnAddressIsTaken(true);
20551 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
20552 return SDValue();
20554 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20555 SDLoc dl(Op);
20556 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20558 if (Depth > 0) {
20559 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20560 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20561 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20562 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20563 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20564 MachinePointerInfo());
20565 }
20567 // Just load the return address.
20568 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20569 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20570 MachinePointerInfo());
20571 }
20573 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20574 SelectionDAG &DAG) const {
20575 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20576 return getReturnAddressFrameIndex(DAG);
20577 }
20579 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20580 MachineFunction &MF = DAG.getMachineFunction();
20581 MachineFrameInfo &MFI = MF.getFrameInfo();
20582 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20583 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20584 EVT VT = Op.getValueType();
20586 MFI.setFrameAddressIsTaken(true);
20588 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20589 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
20590 // is not possible to crawl up the stack without looking at the unwind codes
20591 // simultaneously.
20592 int FrameAddrIndex = FuncInfo->getFAIndex();
20593 if (!FrameAddrIndex) {
20594 // Set up a frame object for the return address.
20595 unsigned SlotSize = RegInfo->getSlotSize();
20596 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20597 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20598 FuncInfo->setFAIndex(FrameAddrIndex);
20599 }
20600 return DAG.getFrameIndex(FrameAddrIndex, VT);
20601 }
20603 unsigned FrameReg =
20604 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20605 SDLoc dl(Op); // FIXME probably not meaningful
20606 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20607 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20608 (FrameReg == X86::EBP && VT == MVT::i32)) &&
20609 "Invalid Frame Register!");
20610 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
20611 while (Depth--)
20612 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20613 MachinePointerInfo());
20614 return FrameAddr;
20615 }
20617 // FIXME? Maybe this could be a TableGen attribute on some registers and
20618 // this table could be generated automatically from RegInfo.
20619 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20620 SelectionDAG &DAG) const {
20621 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20622 const MachineFunction &MF = DAG.getMachineFunction();
20624 unsigned Reg = StringSwitch<unsigned>(RegName)
20625 .Case("esp", X86::ESP)
20626 .Case("rsp", X86::RSP)
20627 .Case("ebp", X86::EBP)
20628 .Case("rbp", X86::RBP)
20631 if (Reg == X86::EBP || Reg == X86::RBP) {
20632 if (!TFI.hasFP(MF))
20633 report_fatal_error("register " + StringRef(RegName) +
20634 " is allocatable: function has no frame pointer");
20637 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20638 unsigned FrameReg =
20639 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20640 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20641 "Invalid Frame Register!");
20649 report_fatal_error("Invalid register name global variable");
20652 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20653 SelectionDAG &DAG) const {
20654 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20655 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20656 }
20658 unsigned X86TargetLowering::getExceptionPointerRegister(
20659 const Constant *PersonalityFn) const {
20660 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20661 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20663 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20664 }
20666 unsigned X86TargetLowering::getExceptionSelectorRegister(
20667 const Constant *PersonalityFn) const {
20668 // Funclet personalities don't use selectors (the runtime does the selection).
20669 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20670 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20671 }
20673 bool X86TargetLowering::needsFixedCatchObjects() const {
20674 return Subtarget.isTargetWin64();
20675 }
20677 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20678 SDValue Chain = Op.getOperand(0);
20679 SDValue Offset = Op.getOperand(1);
20680 SDValue Handler = Op.getOperand(2);
20681 SDLoc dl(Op);
20683 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20684 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20685 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20686 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20687 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20688 "Invalid Frame Register!");
20689 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20690 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20692 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20693 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20694 dl));
20695 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20696 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20697 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20699 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20700 DAG.getRegister(StoreAddrReg, PtrVT));
20701 }
20703 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20704 SelectionDAG &DAG) const {
20705 SDLoc DL(Op);
20706 // If the subtarget is not 64bit, we may need the global base reg
20707 // after isel expand pseudo, i.e., after CGBR pass ran.
20708 // Therefore, ask for the GlobalBaseReg now, so that the pass
20709 // inserts the code for us in case we need it.
20710 // Otherwise, we will end up in a situation where we will
20711 // reference a virtual register that is not defined!
20712 if (!Subtarget.is64Bit()) {
20713 const X86InstrInfo *TII = Subtarget.getInstrInfo();
20714 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20715 }
20716 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20717 DAG.getVTList(MVT::i32, MVT::Other),
20718 Op.getOperand(0), Op.getOperand(1));
20719 }
20721 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20722 SelectionDAG &DAG) const {
20723 SDLoc DL(Op);
20724 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20725 Op.getOperand(0), Op.getOperand(1));
20726 }
20728 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20729 SelectionDAG &DAG) const {
20730 SDLoc DL(Op);
20731 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20732 Op.getOperand(0));
20733 }
20735 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20736 return Op.getOperand(0);
20737 }
20739 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20740 SelectionDAG &DAG) const {
20741 SDValue Root = Op.getOperand(0);
20742 SDValue Trmp = Op.getOperand(1); // trampoline
20743 SDValue FPtr = Op.getOperand(2); // nested function
20744 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20745 SDLoc dl(Op);
20747 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20748 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20750 if (Subtarget.is64Bit()) {
20751 SDValue OutChains[6];
20753 // Large code-model.
20754 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20755 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20757 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20758 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20760 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20762 // Load the pointer to the nested function into R11.
20763 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20764 SDValue Addr = Trmp;
20765 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20766 Addr, MachinePointerInfo(TrmpAddr));
20768 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20769 DAG.getConstant(2, dl, MVT::i64));
20770 OutChains[1] =
20771 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20772 /* Alignment = */ 2);
20774 // Load the 'nest' parameter value into R10.
20775 // R10 is specified in X86CallingConv.td
20776 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20777 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20778 DAG.getConstant(10, dl, MVT::i64));
20779 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20780 Addr, MachinePointerInfo(TrmpAddr, 10));
20782 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20783 DAG.getConstant(12, dl, MVT::i64));
20784 OutChains[3] =
20785 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20786 /* Alignment = */ 2);
20788 // Jump to the nested function.
20789 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20790 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20791 DAG.getConstant(20, dl, MVT::i64));
20792 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20793 Addr, MachinePointerInfo(TrmpAddr, 20));
20795 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20796 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20797 DAG.getConstant(22, dl, MVT::i64));
20798 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20799 Addr, MachinePointerInfo(TrmpAddr, 22));
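// Putting the stores above together, the trampoline bytes are laid out as
// (offsets into Trmp; opcode bytes as computed from REX_WB, MOV64ri and JMP64r):
//   0: 49 BB      movabsq $<FPtr>, %r11
//   2: <FPtr>     8-byte address of the nested function
//  10: 49 BA      movabsq $<Nest>, %r10
//  12: <Nest>     8-byte 'nest' parameter value
//  20: 49 FF      jmpq *%r11 (REX prefix + opcode)
//  22: E3         ModRM byte selecting the r11 register form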
20801 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20802 } else {
20803 const Function *Func =
20804 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20805 CallingConv::ID CC = Func->getCallingConv();
20806 unsigned NestReg;
20808 switch (CC) {
20809 default:
20810 llvm_unreachable("Unsupported calling convention");
20811 case CallingConv::C:
20812 case CallingConv::X86_StdCall: {
20813 // Pass 'nest' parameter in ECX.
20814 // Must be kept in sync with X86CallingConv.td
20815 NestReg = X86::ECX;
20817 // Check that ECX wasn't needed by an 'inreg' parameter.
20818 FunctionType *FTy = Func->getFunctionType();
20819 const AttributeList &Attrs = Func->getAttributes();
20821 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20822 unsigned InRegCount = 0;
20823 unsigned Idx = 1;
20825 for (FunctionType::param_iterator I = FTy->param_begin(),
20826 E = FTy->param_end(); I != E; ++I, ++Idx)
20827 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20828 auto &DL = DAG.getDataLayout();
20829 // FIXME: should only count parameters that are lowered to integers.
20830 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
20831 }
20833 if (InRegCount > 2) {
20834 report_fatal_error("Nest register in use - reduce number of inreg"
20835 " parameters!");
20836 }
20837 }
20838 break;
20839 }
20840 case CallingConv::X86_FastCall:
20841 case CallingConv::X86_ThisCall:
20842 case CallingConv::Fast:
20843 // Pass 'nest' parameter in EAX.
20844 // Must be kept in sync with X86CallingConv.td
20845 NestReg = X86::EAX;
20846 break;
20847 }
20849 SDValue OutChains[4];
20850 SDValue Addr, Disp;
20852 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20853 DAG.getConstant(10, dl, MVT::i32));
20854 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20856 // This is storing the opcode for MOV32ri.
20857 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20858 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
20859 OutChains[0] =
20860 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20861 Trmp, MachinePointerInfo(TrmpAddr));
20863 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20864 DAG.getConstant(1, dl, MVT::i32));
20865 OutChains[1] =
20866 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20867 /* Alignment = */ 1);
20869 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20870 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20871 DAG.getConstant(5, dl, MVT::i32));
20872 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20873 Addr, MachinePointerInfo(TrmpAddr, 5),
20874 /* Alignment = */ 1);
20876 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20877 DAG.getConstant(6, dl, MVT::i32));
20878 OutChains[3] =
20879 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20880 /* Alignment = */ 1);
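// The resulting 32-bit trampoline is 10 bytes (offsets into Trmp):
//   0: B8+reg    movl $<Nest>, %ecx or %eax (opcode byte encodes NestReg)
//   1: <Nest>    4-byte 'nest' parameter value
//   5: E9        jmp rel32
//   6: <Disp>    4-byte displacement to the nested function, FPtr - (Trmp + 10)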
20882 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20883 }
20884 }
20886 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20887 SelectionDAG &DAG) const {
20888 /*
20889 The rounding mode is in bits 11:10 of FPSR, and has the following
20890 settings:
20891 00 Round to nearest
20892 01 Round to -inf
20893 10 Round to +inf
20894 11 Round to 0
20896 FLT_ROUNDS, on the other hand, expects the following:
20897 -1 Undefined
20898 0 Round to 0
20899 1 Round to nearest
20900 2 Round to +inf
20901 3 Round to -inf
20903 To perform the conversion, we do:
20904 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
20905 */
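// Worked example of the conversion above: for FPSR bits 11:10 = 10 (round to
// +inf), (FPSR & 0x800) >> 11 = 1 and (FPSR & 0x400) >> 9 = 0, so
// (1 | 0) + 1 = 2, and 2 & 3 = 2, the FLT_ROUNDS encoding of "round to +inf".
// Likewise bits 11 (round to 0) give (1 | 2) + 1 = 4, masked by & 3 to 0.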
20907 MachineFunction &MF = DAG.getMachineFunction();
20908 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20909 unsigned StackAlignment = TFI.getStackAlignment();
20910 MVT VT = Op.getSimpleValueType();
20911 SDLoc DL(Op);
20913 // Save FP Control Word to stack slot
20914 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20915 SDValue StackSlot =
20916 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
20918 MachineMemOperand *MMO =
20919 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20920 MachineMemOperand::MOStore, 2, 2);
20922 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20923 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20924 DAG.getVTList(MVT::Other),
20925 Ops, MVT::i16, MMO);
20927 // Load FP Control Word from stack slot
20928 SDValue CWD =
20929 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20931 // Transform as necessary
20932 SDValue CWD1 =
20933 DAG.getNode(ISD::SRL, DL, MVT::i16,
20934 DAG.getNode(ISD::AND, DL, MVT::i16,
20935 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20936 DAG.getConstant(11, DL, MVT::i8));
20937 SDValue CWD2 =
20938 DAG.getNode(ISD::SRL, DL, MVT::i16,
20939 DAG.getNode(ISD::AND, DL, MVT::i16,
20940 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20941 DAG.getConstant(9, DL, MVT::i8));
20943 SDValue RetVal =
20944 DAG.getNode(ISD::AND, DL, MVT::i16,
20945 DAG.getNode(ISD::ADD, DL, MVT::i16,
20946 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20947 DAG.getConstant(1, DL, MVT::i16)),
20948 DAG.getConstant(3, DL, MVT::i16));
20950 return DAG.getNode((VT.getSizeInBits() < 16 ?
20951 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20952 }
20954 // Split a unary integer op into 2 half-sized ops.
20955 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
20956 MVT VT = Op.getSimpleValueType();
20957 unsigned NumElems = VT.getVectorNumElements();
20958 unsigned SizeInBits = VT.getSizeInBits();
20960 // Extract the Lo/Hi vectors
20961 SDLoc dl(Op);
20962 SDValue Src = Op.getOperand(0);
20963 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
20964 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
20966 MVT EltVT = VT.getVectorElementType();
20967 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
20968 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20969 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
20970 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
20971 }
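// For example, a v32i8 unary op that is illegal at that width is handled here
// by running the same opcode on the two extracted v16i8 halves and rejoining
// the results with CONCAT_VECTORS.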
20973 // Decompose 256-bit ops into smaller 128-bit ops.
20974 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
20975 assert(Op.getSimpleValueType().is256BitVector() &&
20976 Op.getSimpleValueType().isInteger() &&
20977 "Only handle AVX 256-bit vector integer operation");
20978 return LowerVectorIntUnary(Op, DAG);
20979 }
20981 // Decompose 512-bit ops into smaller 256-bit ops.
20982 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
20983 assert(Op.getSimpleValueType().is512BitVector() &&
20984 Op.getSimpleValueType().isInteger() &&
20985 "Only handle AVX 512-bit vector integer operation");
20986 return LowerVectorIntUnary(Op, DAG);
20987 }
20989 /// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
20990 //
20991 // i8/i16 vector implemented using dword LZCNT vector instruction
20992 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
20993 // split the vector, perform the operation on its Lo and Hi parts and
20994 // concatenate the results.
20995 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
20996 assert(Op.getOpcode() == ISD::CTLZ);
20997 SDLoc dl(Op);
20998 MVT VT = Op.getSimpleValueType();
20999 MVT EltVT = VT.getVectorElementType();
21000 unsigned NumElems = VT.getVectorNumElements();
21002 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21003 "Unsupported element type");
21005 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
21006 if (16 < NumElems)
21007 return LowerVectorIntUnary(Op, DAG);
21009 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21010 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21011 "Unsupported value type for operation");
21013 // Use the natively supported vector instruction vplzcntd.
21014 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21015 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21016 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21017 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21019 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21020 }
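// Illustration for v16i8: the input is zero-extended to v16i32, VPLZCNTD
// counts leading zeros per dword, the result is truncated back to v16i8, and
// Delta = 32 - 8 = 24 is subtracted to drop the zeros added by the extension.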
21022 // Lower CTLZ using a PSHUFB lookup table implementation.
21023 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21024 const X86Subtarget &Subtarget,
21025 SelectionDAG &DAG) {
21026 MVT VT = Op.getSimpleValueType();
21027 int NumElts = VT.getVectorNumElements();
21028 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21029 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21031 // Per-nibble leading zero PSHUFB lookup table.
21032 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21033 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21034 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21035 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
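// The table holds the leading-zero count of each 4-bit value: nibble 0x1
// (0b0001) has 3 leading zeros, nibble 0x4 (0b0100) has 1, and every nibble
// >= 8 starts with a set bit, so it maps to 0.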
21037 SmallVector<SDValue, 64> LUTVec;
21038 for (int i = 0; i < NumBytes; ++i)
21039 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21040 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21042 // Begin by bitcasting the input to byte vector, then split those bytes
21043 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
21044 // If the hi input nibble is zero then we add both results together, otherwise
21045 // we just take the hi result (by masking the lo result to zero before the
21046 // add).
21047 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21048 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21050 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21051 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21052 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21053 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21054 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21056 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21057 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21058 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21059 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21061 // Merge result back from vXi8 back to VT, working on the lo/hi halves
21062 // of the current vector width in the same way we did for the nibbles.
21063 // If the upper half of the input element is zero then add the halves'
21064 // leading zero counts together, otherwise just use the upper half's.
21065 // Double the width of the result until we are at target width.
21066 while (CurrVT != VT) {
21067 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21068 int CurrNumElts = CurrVT.getVectorNumElements();
21069 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21070 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21071 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21073 // Check if the upper half of the input element is zero.
21074 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21075 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21076 HiZ = DAG.getBitcast(NextVT, HiZ);
21078 // Move the upper/lower halves to the lower bits as we'll be extending to
21079 // NextVT. Mask the lower result to zero if HiZ is true and add the results
21080 // together.
21081 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21082 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21083 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21084 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21085 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21087 CurrVT = NextVT;
21088 }
21090 return Res;
21091 }
21092 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21093 const X86Subtarget &Subtarget,
21094 SelectionDAG &DAG) {
21095 MVT VT = Op.getSimpleValueType();
21097 if (Subtarget.hasCDI())
21098 return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21100 // Decompose 256-bit ops into smaller 128-bit ops.
21101 if (VT.is256BitVector() && !Subtarget.hasInt256())
21102 return Lower256IntUnary(Op, DAG);
21104 // Decompose 512-bit ops into smaller 256-bit ops.
21105 if (VT.is512BitVector() && !Subtarget.hasBWI())
21106 return Lower512IntUnary(Op, DAG);
21108 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21109 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21110 }
21112 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21113 SelectionDAG &DAG) {
21114 MVT VT = Op.getSimpleValueType();
21115 MVT OpVT = VT;
21116 unsigned NumBits = VT.getSizeInBits();
21117 SDLoc dl(Op);
21118 unsigned Opc = Op.getOpcode();
21120 if (VT.isVector())
21121 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21123 Op = Op.getOperand(0);
21124 if (VT == MVT::i8) {
21125 // Zero extend to i32 since there is not an i8 bsr.
21126 OpVT = MVT::i32;
21127 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21128 }
21130 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21131 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21132 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21134 if (Opc == ISD::CTLZ) {
21135 // If src is zero (i.e. bsr sets ZF), returns NumBits.
21136 SDValue Ops[] = {
21137 Op,
21138 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21139 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21140 Op.getValue(1)
21141 };
21142 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21143 }
21145 // Finally xor with NumBits-1.
21146 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21147 DAG.getConstant(NumBits - 1, dl, OpVT));
21149 if (VT == MVT::i8)
21150 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
21151 return Op;
21152 }
21154 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21155 MVT VT = Op.getSimpleValueType();
21156 unsigned NumBits = VT.getScalarSizeInBits();
21157 SDLoc dl(Op);
21159 if (VT.isVector()) {
21160 SDValue N0 = Op.getOperand(0);
21161 SDValue Zero = DAG.getConstant(0, dl, VT);
21163 // lsb(x) = (x & -x)
21164 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21165 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21167 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21168 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21169 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21170 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21171 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21174 // cttz(x) = ctpop(lsb - 1)
21175 SDValue One = DAG.getConstant(1, dl, VT);
21176 return DAG.getNode(ISD::CTPOP, dl, VT,
21177 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21178 }
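// Worked example of the identities above for an i8 element x = 12 (0b00001100):
// lsb = x & -x = 4, so cttz(x) = ctpop(lsb - 1) = ctpop(0b011) = 2, and
// cttz_undef(x) = (width - 1) - ctlz(lsb) = 7 - 5 = 2 as well.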
21180 assert(Op.getOpcode() == ISD::CTTZ &&
21181 "Only scalar CTTZ requires custom lowering");
21183 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21184 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21185 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21187 // If src is zero (i.e. bsf sets ZF), returns NumBits.
21188 SDValue Ops[] = {
21189 Op,
21190 DAG.getConstant(NumBits, dl, VT),
21191 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21192 Op.getValue(1)
21193 };
21194 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21195 }
21197 /// Break a 256-bit integer operation into two new 128-bit ones and then
21198 /// concatenate the result back.
21199 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21200 MVT VT = Op.getSimpleValueType();
21202 assert(VT.is256BitVector() && VT.isInteger() &&
21203 "Unsupported value type for operation");
21205 unsigned NumElems = VT.getVectorNumElements();
21207 SDLoc dl(Op);
21208 // Extract the LHS vectors
21209 SDValue LHS = Op.getOperand(0);
21210 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21211 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21213 // Extract the RHS vectors
21214 SDValue RHS = Op.getOperand(1);
21215 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21216 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21218 MVT EltVT = VT.getVectorElementType();
21219 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21221 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21222 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21223 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21224 }
21226 /// Break a 512-bit integer operation into two new 256-bit ones and then
21227 /// concatenate the result back.
21228 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21229 MVT VT = Op.getSimpleValueType();
21231 assert(VT.is512BitVector() && VT.isInteger() &&
21232 "Unsupported value type for operation");
21234 unsigned NumElems = VT.getVectorNumElements();
21236 SDLoc dl(Op);
21237 // Extract the LHS vectors
21238 SDValue LHS = Op.getOperand(0);
21239 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21240 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21242 // Extract the RHS vectors
21243 SDValue RHS = Op.getOperand(1);
21244 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21245 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21247 MVT EltVT = VT.getVectorElementType();
21248 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21250 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21251 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21252 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21253 }
21255 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21256 MVT VT = Op.getSimpleValueType();
21257 if (VT.getScalarType() == MVT::i1)
21258 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21259 Op.getOperand(0), Op.getOperand(1));
21260 assert(Op.getSimpleValueType().is256BitVector() &&
21261 Op.getSimpleValueType().isInteger() &&
21262 "Only handle AVX 256-bit vector integer operation");
21263 return Lower256IntArith(Op, DAG);
21264 }
21266 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21267 assert(Op.getSimpleValueType().is256BitVector() &&
21268 Op.getSimpleValueType().isInteger() &&
21269 "Only handle AVX 256-bit vector integer operation");
21270 return Lower256IntUnary(Op, DAG);
21271 }
21273 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21274 assert(Op.getSimpleValueType().is256BitVector() &&
21275 Op.getSimpleValueType().isInteger() &&
21276 "Only handle AVX 256-bit vector integer operation");
21277 return Lower256IntArith(Op, DAG);
21278 }
21280 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21281 SelectionDAG &DAG) {
21282 SDLoc dl(Op);
21283 MVT VT = Op.getSimpleValueType();
21285 if (VT.getScalarType() == MVT::i1)
21286 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21288 // Decompose 256-bit ops into smaller 128-bit ops.
21289 if (VT.is256BitVector() && !Subtarget.hasInt256())
21290 return Lower256IntArith(Op, DAG);
21292 SDValue A = Op.getOperand(0);
21293 SDValue B = Op.getOperand(1);
21295 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21296 // vector pairs, multiply and truncate.
21297 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21298 if (Subtarget.hasInt256()) {
21299 // For 512-bit vectors, split into 256-bit vectors to allow the
21300 // sign-extension to occur.
21301 if (VT == MVT::v64i8)
21302 return Lower512IntArith(Op, DAG);
21304 // For 256-bit vectors, split into 128-bit vectors to allow the
21305 // sign-extension to occur. We don't need this on AVX512BW as we can
21306 // safely sign-extend to v32i16.
21307 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21308 return Lower256IntArith(Op, DAG);
21310 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21311 return DAG.getNode(
21312 ISD::TRUNCATE, dl, VT,
21313 DAG.getNode(ISD::MUL, dl, ExVT,
21314 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21315 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21316 }
21318 assert(VT == MVT::v16i8 &&
21319 "Pre-AVX2 support only supports v16i8 multiplication");
21320 MVT ExVT = MVT::v8i16;
21322 // Extract the lo parts and sign extend to i16
21323 SDValue ALo, BLo;
21324 if (Subtarget.hasSSE41()) {
21325 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21326 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21327 } else {
21328 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21329 -1, 4, -1, 5, -1, 6, -1, 7};
21330 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21331 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21332 ALo = DAG.getBitcast(ExVT, ALo);
21333 BLo = DAG.getBitcast(ExVT, BLo);
21334 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21335 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21336 }
21338 // Extract the hi parts and sign extend to i16
21339 SDValue AHi, BHi;
21340 if (Subtarget.hasSSE41()) {
21341 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21342 -1, -1, -1, -1, -1, -1, -1, -1};
21343 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21344 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21345 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21346 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21347 } else {
21348 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21349 -1, 12, -1, 13, -1, 14, -1, 15};
21350 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21351 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21352 AHi = DAG.getBitcast(ExVT, AHi);
21353 BHi = DAG.getBitcast(ExVT, BHi);
21354 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21355 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21356 }
21358 // Multiply, mask the lower 8 bits of the lo/hi results and pack
21359 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21360 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21361 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21362 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21363 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21364 }
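// The widening above is sound because the low 8 bits of a 16-bit product
// depend only on the low 8 bits of the operands: e.g. (-3) * 5 sign-extends
// to 0xFFFD * 0x0005 = 0xFFF1, whose low byte 0xF1 is exactly -15 in i8.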
21366 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21367 if (VT == MVT::v4i32) {
21368 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21369 "Should not custom lower when pmuldq is available!");
21371 // Extract the odd parts.
21372 static const int UnpackMask[] = { 1, -1, 3, -1 };
21373 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21374 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21376 // Multiply the even parts.
21377 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21378 // Now multiply odd parts.
21379 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21381 Evens = DAG.getBitcast(VT, Evens);
21382 Odds = DAG.getBitcast(VT, Odds);
21384 // Merge the two vectors back together with a shuffle. This expands into 2
21385 // shuffles.
21386 static const int ShufMask[] = { 0, 4, 2, 6 };
21387 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21388 }
21390 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21391 "Only know how to lower V2I64/V4I64/V8I64 multiply");
21393 // 32-bit vector types used for MULDQ/MULUDQ.
21394 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21396 // MULDQ returns the 64-bit result of the signed multiplication of the lower
21397 // 32-bits. We can lower with this if the sign bits stretch that far.
21398 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21399 DAG.ComputeNumSignBits(B) > 32) {
21400 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21401 DAG.getBitcast(MulVT, B));
21402 }
21404 // Ahi = psrlqi(a, 32);
21405 // Bhi = psrlqi(b, 32);
21407 // AloBlo = pmuludq(a, b);
21408 // AloBhi = pmuludq(a, Bhi);
21409 // AhiBlo = pmuludq(Ahi, b);
21411 // Hi = psllqi(AloBhi + AhiBlo, 32);
21412 // return AloBlo + Hi;
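// This is schoolbook multiplication on 32-bit digits:
//   a * b = (Alo + 2^32 * Ahi) * (Blo + 2^32 * Bhi)
//         = Alo*Blo + 2^32 * (Alo*Bhi + Ahi*Blo)   (mod 2^64)
// The Ahi*Bhi term is shifted entirely out of the low 64 bits, so it is never
// computed.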
  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

  // Bit cast to 32-bit vectors for MULUDQ.
  SDValue Alo = DAG.getBitcast(MulVT, A);
  SDValue Blo = DAG.getBitcast(MulVT, B);

  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

  // Only multiply lo/hi halves that aren't known to be zero.
  SDValue AloBlo = Zero;
  if (!ALoIsZero && !BLoIsZero)
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

  SDValue AloBhi = Zero;
  if (!ALoIsZero && !BHiIsZero) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    Bhi = DAG.getBitcast(MulVT, Bhi);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!AHiIsZero && !BLoIsZero) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    Ahi = DAG.getBitcast(MulVT, Ahi);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
  }

  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}

static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  // Only i8 vectors should need custom lowering after this.
  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
         "Unsupported vector type");

  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
  // logical shift down the upper half and pack back to i8.
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
  // and then ashr/lshr the upper bits down to the lower bits before multiply.
  unsigned Opcode = Op.getOpcode();
  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);

  // AVX2 implementations - extend xmm subvectors to ymm.
  if (Subtarget.hasInt256()) {
    SDValue Lo = DAG.getIntPtrConstant(0, dl);
    SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);

    if (VT == MVT::v32i8) {
      SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
      SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
      SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
      SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
      ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
      BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
      AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
      BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
                       DAG.getConstant(8, dl, MVT::v16i16));
      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
                       DAG.getConstant(8, dl, MVT::v16i16));
      // The ymm variant of PACKUS treats the 128-bit lanes separately, so
      // before using PACKUS we need to permute the inputs to the correct
      // lo/hi xmm lane.
      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
                            16, 17, 18, 19, 20, 21, 22, 23};
      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            24, 25, 26, 27, 28, 29, 30, 31};
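      // After these two shuffles, each 128-bit lane of the PACKUS operands
      // holds the i16 results destined for that lane of the final v32i8, so
      // the lane-wise pack emits the bytes in source order.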
      return DAG.getNode(X86ISD::PACKUS, dl, VT,
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
    }

    SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
    SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
    SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
                               DAG.getConstant(8, dl, MVT::v16i16));
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  assert(VT == MVT::v16i8 &&
         "Pre-AVX2 support only supports v16i8 multiplication");
  MVT ExVT = MVT::v8i16;

  // Extract the lo parts and zero/sign extend to i16.
  SDValue ALo, BLo;
  if (Subtarget.hasSSE41()) {
    ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
  } else {
    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                            -1, 4, -1, 5, -1, 6, -1, 7};
    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    ALo = DAG.getBitcast(ExVT, ALo);
    BLo = DAG.getBitcast(ExVT, BLo);
    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
  }

  // Extract the hi parts and zero/sign extend to i16.
  SDValue AHi, BHi;
  if (Subtarget.hasSSE41()) {
    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            -1, -1, -1, -1, -1, -1, -1, -1};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
  } else {
    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                            -1, 12, -1, 13, -1, 14, -1, 15};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = DAG.getBitcast(ExVT, AHi);
    BHi = DAG.getBitcast(ExVT, BHi);
    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
  }

  // Multiply, lshr the upper 8 bits down to the lower 8 bits of the lo/hi
  // results and pack back to v16i8.
  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}

SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
                                             SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  RTLIB::Libcall LC;
  bool isSigned;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case ISD::SDIV:    isSigned = true;  LC = RTLIB::SDIV_I128;    break;
  case ISD::UDIV:    isSigned = false; LC = RTLIB::UDIV_I128;    break;
  case ISD::SREM:    isSigned = true;  LC = RTLIB::SREM_I128;    break;
  case ISD::UREM:    isSigned = false; LC = RTLIB::UREM_I128;    break;
  case ISD::SDIVREM: isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
  }

  SDLoc dl(Op);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
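  // On Win64, i128 is not a legal argument type, so each operand is spilled
  // to a 16-byte-aligned stack slot and the libcall receives a pointer to it.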
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
    EVT ArgVT = Op->getOperand(i).getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
    Entry.Node = StackPtr;
    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
                           MachinePointerInfo(), /* Alignment = */ 16);
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));
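  // The libcall hands the i128 result back in XMM0, so the call is typed as
  // returning v2i64 and the result is bitcast back to the integer type below.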
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setLibCallee(
          getLibcallCallingConv(LC),
          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
          std::move(Args))
      .setInRegister()
      .setSExtResult(isSigned)
      .setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallInfo.first);
}

static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
  MVT VT = Op0.getSimpleValueType();
  SDLoc dl(Op);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    unsigned Opcode = Op.getOpcode();
    unsigned NumElems = VT.getVectorNumElements();
    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
    SDValue Ops[] = {
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
    };
    return DAG.getMergeValues(Ops, dl);
  }
  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
         (VT == MVT::v8i32 && Subtarget.hasInt256()));

  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widened result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
  //
  // Place the odd value at an even position (basically, shift all values 1
  // step to the left):
  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
  // <a|b|c|d> => <b|undef|d|undef>
  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
                                      makeArrayRef(&Mask[0], VT.getVectorNumElements()));
  // <e|f|g|h> => <f|undef|h|undef>
  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
                                      makeArrayRef(&Mask[0], VT.getVectorNumElements()));
  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
  // ints.
  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opcode =
      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
  // => <2 x i64> <bf|dh>
  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

  // Shuffle it back into the right order.
  SDValue Highs, Lows;
  if (VT == MVT::v8i32) {
    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  } else {
    const int HighMask[] = {1, 5, 3, 7};
    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
    const int LowMask[] = {0, 4, 2, 6};
    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  }
  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
  // unsigned multiply.
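  // For 32-bit lanes:  mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0)
  //                                              - (b < 0 ? a : 0).
  // The SRA by 31 builds an all-ones mask for negative lanes and the ANDs
  // select the two correction terms.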
  if (IsSigned && !Subtarget.hasSSE41()) {
    SDValue ShAmt = DAG.getConstant(
        31, dl,
        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  }

  // The first result of MUL_LOHI is actually the low value, followed by the
  // high value.
  SDValue Ops[] = {Lows, Highs};
  return DAG.getMergeValues(Ops, dl);
}

// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget.
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
                                        unsigned Opcode) {
  if (VT.getScalarSizeInBits() < 16)
    return false;

  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
    return true;

  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
                (VT.is256BitVector() && Subtarget.hasInt256());

  bool AShift = LShift && (Subtarget.hasAVX512() ||
                           (VT != MVT::v2i64 && VT != MVT::v4i64));
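  // There is no PSRAQ before AVX512, which is why v2i64/v4i64 arithmetic
  // shifts right are excluded on older subtargets.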
  return (Opcode == ISD::SRA) ? AShift : LShift;
}

// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
                                      unsigned Opcode) {
  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}

// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget.
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
                                    unsigned Opcode) {
  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
    return false;

  // vXi16 supported only on AVX-512, BWI.
  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
    return false;

  if (Subtarget.hasAVX512())
    return true;

  bool LShift = VT.is128BitVector() || VT.is256BitVector();
  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
  return (Opcode == ISD::SRA) ? AShift : LShift;
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
                    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
    SDValue Ex = DAG.getBitcast(ExVT, R);

    // ashr(R, 63) === cmp_slt(R, 0)
    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
             "Unsupported PCMPGT op");
      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
                         getZeroVector(VT, Subtarget, DAG, dl), R);
    }

    if (ShiftAmt >= 32) {
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
      SDValue Upper =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt - 32, DAG);
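      // Viewed as v4i32/v8i32, each i64 result takes its low dword from Lower
      // (the old high dword shifted by Amt-32) and its high dword from Upper
      // (the sign splat); the shuffles below select exactly those elements.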
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {9, 1, 11, 3, 13, 5, 15, 7});
    } else {
      // SRA upper i32, SRL whole i64 and select lower i32.
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt, DAG);
      SDValue Lower =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
      Lower = DAG.getBitcast(ExVT, Lower);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {8, 1, 10, 3, 12, 5, 14, 7});
    }
    return DAG.getBitcast(VT, Ex);
  };

  // Optimize shl/srl/sra with constant shift amount.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
      uint64_t ShiftAmt = ShiftConst->getZExtValue();

      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

      // i64 SRA needs to be performed as partial shifts.
      if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
          Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
        return ArithmeticShiftRight64(ShiftAmt);

      if (VT == MVT::v16i8 ||
          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
          VT == MVT::v64i8) {
        unsigned NumElts = VT.getVectorNumElements();
        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

        // Simple i8 add case.
        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
          return DAG.getNode(ISD::ADD, dl, VT, R, R);

        // ashr(R, 7) === cmp_slt(R, 0)
        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
          if (VT.is512BitVector()) {
            assert(VT == MVT::v64i8 && "Unexpected element type!");
            SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
          }
          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
        }

        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
        if (VT == MVT::v16i8 && Subtarget.hasXOP())
          return SDValue();
        if (Op.getOpcode() == ISD::SHL) {
          // Make a large shift.
          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                   R, ShiftAmt, DAG);
          SHL = DAG.getBitcast(VT, SHL);
          // Zero out the rightmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SHL,
                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
        }
        if (Op.getOpcode() == ISD::SRL) {
          // Make a large shift.
          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
                                                   R, ShiftAmt, DAG);
          SRL = DAG.getBitcast(VT, SRL);
          // Zero out the leftmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SRL,
                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
        }
        if (Op.getOpcode() == ISD::SRA) {
          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
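          // The lshr parks the byte's sign bit at position 7-ShiftAmt, where
          // Mask has its single set bit; XOR flips it and the SUB borrows
          // through the bits above it, i.e. sign-extends the result.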
          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
          return Res;
        }
        llvm_unreachable("Unknown shift opcode.");
      }
    }
  }

  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
  if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
       (Subtarget.hasAVX512() && VT == MVT::v8i64))) {

    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
    unsigned SubVectorScale = 1;
    if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      SubVectorScale =
          Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
      Amt = Amt.getOperand(0);
    }
    // Peek through any splat that was introduced for i64 shift vectorization.
    int SplatIndex = -1;
    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
      if (SVN->isSplat()) {
        SplatIndex = SVN->getSplatIndex();
        Amt = Amt.getOperand(0);
        assert(SplatIndex < (int)VT.getVectorNumElements() &&
               "Splat shuffle referencing second operand");
      }

    if (Amt.getOpcode() != ISD::BITCAST ||
        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
      return SDValue();

    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     (SubVectorScale * VT.getVectorNumElements());
    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
    uint64_t ShiftAmt = 0;
    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
    for (unsigned i = 0; i != Ratio; ++i) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
      if (!C)
        return SDValue();
      // 6 == Log2(64)
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
    }

    // Check remaining shift amounts (if not a splat).
    if (SplatIndex < 0) {
      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
        uint64_t ShAmt = 0;
        for (unsigned j = 0; j != Ratio; ++j) {
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
          if (!C)
            return SDValue();
          // 6 == Log2(64)
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
        }
        if (ShAmt != ShiftAmt)
          return SDValue();
      }
    }

    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

    if (Op.getOpcode() == ISD::SRA)
      return ArithmeticShiftRight64(ShiftAmt);
  }

  return SDValue();
}

static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
                     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

  unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
                     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
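  // A splatted shift amount can use the legacy SSE2 PSLL/PSRL/PSRA forms,
  // which shift every lane by the scalar held in the low 64 bits of an XMM
  // register; the code below just digs that scalar out of the amount operand.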
  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
    SDValue BaseShAmt;
    MVT EltVT = VT.getVectorElementType();

    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
      // Check if this build_vector node is doing a splat.
      // If so, then set BaseShAmt equal to the splat value.
      BaseShAmt = BV->getSplatValue();
      if (BaseShAmt && BaseShAmt.isUndef())
        BaseShAmt = SDValue();
    } else {
      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
        Amt = Amt.getOperand(0);

      ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
      if (SVN && SVN->isSplat()) {
        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
        SDValue InVec = Amt.getOperand(0);
        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
          assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
                 "Unexpected shuffle index found!");
          BaseShAmt = InVec.getOperand(SplatIdx);
        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
          if (ConstantSDNode *C =
                  dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
            if (C->getZExtValue() == SplatIdx)
              BaseShAmt = InVec.getOperand(1);
          }
        }

        if (!BaseShAmt)
          // Avoid introducing an extract element from a shuffle.
          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
                                  DAG.getIntPtrConstant(SplatIdx, dl));
      }
    }

    if (BaseShAmt.getNode()) {
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
      else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
    }
  }

  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
  if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
      Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     VT.getVectorNumElements();
    std::vector<SDValue> Vals(Ratio);
    for (unsigned i = 0; i != Ratio; ++i)
      Vals[i] = Amt.getOperand(i);
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }

    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
  }
  return SDValue();
}

static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

  assert(VT.isVector() && "Custom lowering only for vector shifts!");
  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
    return V;

  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
    return V;

  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
    return Op;

  // XOP has 128-bit variable logical/arithmetic shifts.
  // +ve/-ve Amt = shift left/right.
  if (Subtarget.hasXOP() &&
      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
       VT == MVT::v8i16 || VT == MVT::v16i8)) {
    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
      SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
    }
    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
    if (Op.getOpcode() == ISD::SRA)
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
  }

  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
  // shifts per-lane and then shuffle the partial results back together.
  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
    // Splat the shift amounts so the scalar shifts above will catch it.
    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
  }

  // i64 vector arithmetic shift can be emulated with the transform:
  // M = lshr(SIGN_MASK, Amt)
  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
      Op.getOpcode() == ISD::SRA) {
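    // lshr moves each lane's sign bit to position 63-Amt, exactly where M has
    // its single set bit; the XOR/SUB pair then sign-extends that bit through
    // the upper bits.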
    SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
    return R;
  }

  // If possible, lower this packed shift into a vector multiply instead of
  // expanding it into a sequence of scalar shifts.
  // Do this only if the vector shift count is a constant build_vector.
  if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
       (Subtarget.hasInt256() && VT == MVT::v16i16))) {
    SmallVector<SDValue, 8> Elts;
    MVT SVT = VT.getVectorElementType();
    unsigned SVTBits = SVT.getSizeInBits();
    APInt One(SVTBits, 1);
    unsigned NumElems = VT.getVectorNumElements();

    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Op = Amt->getOperand(i);
      if (Op->isUndef()) {
        Elts.push_back(Op);
        continue;
      }

      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
      uint64_t ShAmt = C.getZExtValue();
      if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
    }
    SDValue BV = DAG.getBuildVector(VT, dl, Elts);
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  }

  // Lower SHL with variable shift amount.
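  // (Amt << 23) + 0x3f800000 assembles, in each lane, the IEEE-754 bit
  // pattern of the float 2^Amt (biased exponent 127 + Amt, zero mantissa);
  // converting back to integer therefore yields 2^Amt, and the multiply
  // implements the variable shift without any variable-shift instruction.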
  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

    Op = DAG.getNode(ISD::ADD, dl, VT, Op,
                     DAG.getConstant(0x3f800000U, dl, VT));
    Op = DAG.getBitcast(MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }

  // If possible, lower this shift as a sequence of two shifts by
  // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
  // Example:
  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
  //
  // Could be rewritten as:
  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  //
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
    unsigned TargetOpcode = X86ISD::MOVSS;
    bool CanBeSimplified;
    // The splat value for the first packed shift (the 'X' from the example).
    SDValue Amt1 = Amt->getOperand(0);
    // The splat value for the second packed shift (the 'Y' from the example).
    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);

    // See if it is possible to replace this node with a sequence of
    // two shifts followed by a MOVSS/MOVSD/PBLEND.
    if (VT == MVT::v4i32) {
      // Check if it is legal to use a MOVSS.
      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
                        Amt2 == Amt->getOperand(3);
      if (!CanBeSimplified) {
        // Otherwise, check if we can still simplify this node using a MOVSD.
        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
                          Amt->getOperand(2) == Amt->getOperand(3);
        TargetOpcode = X86ISD::MOVSD;
        Amt2 = Amt->getOperand(2);
      }
    } else {
      // Do similar checks for the case where the machine value type
      // is MVT::v8i16.
      CanBeSimplified = Amt1 == Amt->getOperand(1);
      for (unsigned i = 3; i != 8 && CanBeSimplified; ++i)
        CanBeSimplified = Amt2 == Amt->getOperand(i);

      if (!CanBeSimplified) {
        TargetOpcode = X86ISD::MOVSD;
        CanBeSimplified = true;
        Amt2 = Amt->getOperand(4);
        for (unsigned i = 0; i != 4 && CanBeSimplified; ++i)
          CanBeSimplified = Amt1 == Amt->getOperand(i);
        for (unsigned j = 4; j != 8 && CanBeSimplified; ++j)
          CanBeSimplified = Amt2 == Amt->getOperand(j);
      }
    }

    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
        isa<ConstantSDNode>(Amt2)) {
      // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
      MVT CastVT = MVT::v4i32;
      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
      SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
      SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
      if (TargetOpcode == X86ISD::MOVSD)
        return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
                                                       BitCast2, {0, 1, 6, 7}));
      return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
                                                     BitCast2, {0, 5, 6, 7}));
    }
  }

  // v4i32 Non Uniform Shifts.
  // If the shift amount is constant we can shift each lane using the SSE2
  // immediate shifts, else we need to zero-extend each lane to the lower i64
  // and shift using the SSE2 variable shifts.
  // The separate results can then be blended together.
  if (VT == MVT::v4i32) {
    unsigned Opc = Op.getOpcode();
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default:
        llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL:
        Opc = X86ISD::VSHL;
        break;
      case ISD::SRL:
        Opc = X86ISD::VSRL;
        break;
      case ISD::SRA:
        Opc = X86ISD::VSRA;
        break;
      }
      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. These shuffle masks
      // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
    }

    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
    SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
    SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
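    // Each Ri is fully correct only in lane i; the three shuffles below
    // gather lane 0 of R0, lane 1 of R1, lane 2 of R2 and lane 3 of R3.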
    SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
    SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
  }

  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
  // make the existing SSE solution better.
  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
      (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
      (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
      (Subtarget.hasBWI() && VT == MVT::v32i8)) {
    MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
  }

  if (VT == MVT::v16i8 ||
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
    unsigned ShiftOpcode = Op->getOpcode();

    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (VT.is512BitVector()) {
        // On AVX512BW targets we make use of the fact that VSELECT lowers
        // to a masked blend which selects bytes based just on the sign bit
        // extracted to a mask.
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      } else if (Subtarget.hasSSE41()) {
        // On SSE41 targets we make use of the fact that VSELECT lowers
        // to PBLENDVB which selects bytes based just on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getSelect(dl, SelVT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
    Amt = DAG.getBitcast(VT, Amt);
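    // This is a bit-serial shifter: after the << 5, the sign bit of each byte
    // is bit 2 of its 3-bit shift amount. Each round conditionally applies a
    // shift by 4, 2 and then 1, doubling Amt ("a += a") between rounds so the
    // next amount bit moves into the sign position.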
    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
      // r = VSELECT(r, shift(r, 4), a);
      SDValue M =
          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // r = VSELECT(r, shift(r, 2), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // return VSELECT(r, shift(r, 1), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }
    if (Op->getOpcode() == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of an i16
      // vector so we can correctly sign extend. We don't care what happens
      // to the lower byte.
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
      ALo = DAG.getBitcast(ExtVT, ALo);
      AHi = DAG.getBitcast(ExtVT, AHi);
      RLo = DAG.getBitcast(ExtVT, RLo);
      RHi = DAG.getBitcast(ExtVT, RHi);

      // r = VSELECT(r, shift(r, 4), a);
      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                                DAG.getConstant(4, dl, ExtVT));
      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                                DAG.getConstant(4, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 2), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(2, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(2, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 1), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(1, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(1, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // Logical shift the result back to the lower byte, leaving a zero upper
      // byte, meaning that we can safely pack with PACKUSWB.
      RLo =
          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
      RHi =
          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }

  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
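    // Unpacking R against zero puts each i16 in the high half of an i32 lane
    // while the amount is zero-extended into the low half. A 32-bit shift by
    // that amount followed by a logical shift right by 16 then produces the
    // correctly shifted i16 for SHL, SRL and SRA alike, zero-extended so the
    // PACKUS below cannot saturate.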
    MVT ExtVT = MVT::v8i32;
    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
    ALo = DAG.getBitcast(ExtVT, ALo);
    AHi = DAG.getBitcast(ExtVT, AHi);
    RLo = DAG.getBitcast(ExtVT, RLo);
    RHi = DAG.getBitcast(ExtVT, RHi);
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  if (VT == MVT::v8i16) {
    unsigned ShiftOpcode = Op->getOpcode();

    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
    bool UseSSE41 = Subtarget.hasSSE41() &&
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we make use of the fact that VSELECT lowers
      // to PBLENDVB which selects bytes based just on the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
        V0 = DAG.getBitcast(ExtVT, V0);
        V1 = DAG.getBitcast(ExtVT, V1);
        Sel = DAG.getBitcast(ExtVT, Sel);
        return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
      // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C =
          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
      return DAG.getSelect(dl, VT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
    if (UseSSE41) {
      // On SSE41 targets we need to replicate the shift mask in both
      // bytes for PBLENDVB.
      Amt = DAG.getNode(
          ISD::OR, dl, VT,
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
    } else {
      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
    }

    // r = VSELECT(r, shift(r, 8), a);
    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 4), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 2), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // return VSELECT(r, shift(r, 1), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
    R = SignBitSelect(Amt, M, R);
    return R;
  }

  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  return SDValue();
}

static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  assert(VT.isVector() && "Custom lowering only for vector rotates!");
  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
  assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right.

  // Split 256-bit integers.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

  // Attempt to rotate by immediate.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
      assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
      return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
                         DAG.getConstant(RotateAmt, DL, MVT::i8));
    }
  }

  // Use general rotate by variable (per-element).
  return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
}

static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
  // has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  X86::CondCode Cond;
  SDLoc DL(Op);
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) == MVT::i8) {
      BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
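    // X86ISD::UMUL produces (low, high, EFLAGS); result 2 carries the flags
    // that the setcc below inspects for overflow.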
    SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);

    if (N->getValueType(1) == MVT::i1)
      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);

  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}

/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget.hasCmpxchg16b();

  return false;
}

bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
}

// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  auto PTy = cast<PointerType>(LI->getPointerOperandType());
  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
                                               : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                   : AtomicExpansionKind::None;
  }

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation");
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
                            : AtomicExpansionKind::None;
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86.
    // We must use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
  }
}

LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SynchScope = AI->getSynchScope();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SynchScope == SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
      AI->getType()->getPrimitiveSizeInBits());
  Loaded->setAtomic(Order, SynchScope);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}

static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceScope == CrossThread) {
    if (Subtarget.hasMFence())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
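    // Without MFENCE we fall back to a locked no-op RMW, "lock or dword ptr
    // [esp], 0": any LOCK-prefixed instruction drains the store buffer and so
    // acts as a full barrier.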
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    SDValue Ops[] = {
        DAG.getRegister(X86::ESP, MVT::i32),    // Base
        DAG.getTargetConstant(1, dl, MVT::i8),  // Scale
        DAG.getRegister(0, MVT::i32),           // Index
        DAG.getTargetConstant(0, dl, MVT::i32), // Disp
        DAG.getRegister(0, MVT::i32),           // Segment.
        Zero,
        Chain};
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}

static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch (T.SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
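  // CMPXCHG leaves the previous memory value in AL/AX/EAX/RAX and sets ZF on
  // success, which is why the value is read back from Reg and success is a
  // COND_E test of EFLAGS.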
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}

static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
      SrcVT == MVT::i64) {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
    if (SrcVT.isVector()) {
      NumElts = SrcVT.getVectorNumElements();
      SVT = SrcVT.getVectorElementType();

      // Widen the input vector, e.g. from MVT::v2i32 to MVT::v4i32.
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
             "Unexpected source type in LowerBITCAST");
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(0, dl)));
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(1, dl)));
      NumElts = 2;
      SVT = MVT::i32;
    }
    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
         Subtarget.hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits() == 64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT == MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT == MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}

/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(EltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
  if (EltVT == MVT::i64) {
    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }

  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }

22909 }
22911 // The only element type left is i16.
22912 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22914 // To obtain pop count for each i16 element starting from the pop count for
22915 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22916 // right by 8. It is important to shift as i16s as i8 vector shift isn't
22917 // directly supported.
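// Worked example (a sketch): if one i16 element holds the byte counts
// <lo = 3, hi = 2>, SHL-by-8 as i16 gives <0, 3>, the byte-wise ADD gives
// <3, 5>, and SRL-by-8 as i16 leaves the combined count 5 in the element.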
22918 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22919 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22920 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22921 DAG.getBitcast(ByteVecVT, V));
22922 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22923 }
22925 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22926 const X86Subtarget &Subtarget,
22927 SelectionDAG &DAG) {
22928 MVT VT = Op.getSimpleValueType();
22929 MVT EltVT = VT.getVectorElementType();
22930 unsigned VecSize = VT.getSizeInBits();
22932 // Implement a lookup table in register by using an algorithm based on:
22933 // http://wm.ite.pl/articles/sse-popcount.html
22935 // The general idea is that every lower byte nibble in the input vector is an
22936 // index into an in-register pre-computed pop count table. We then split up
22937 // the input vector into two new ones: (1) a vector with only the shifted-right
22938 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22939 // masked-out higher ones) for each byte. PSHUFB is used separately with both
22940 // to index the in-register table. Next, both are added and the result is an
22941 // i8 vector where each element contains the pop count for its input byte.
22943 // To obtain the pop count for elements != i8, we follow up with the same
22944 // approach and use additional tricks as described below.
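// Worked example for a single byte (a sketch): for 0xB7 (0b10110111) the
// high nibble 0xB indexes LUT[0xb] = 3 and the low nibble 0x7 indexes
// LUT[0x7] = 3, and 3 + 3 == 6 == popcount(0xB7).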
22946 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22947 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22948 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22949 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
22951 int NumByteElts = VecSize / 8;
22952 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22953 SDValue In = DAG.getBitcast(ByteVecVT, Op);
22954 SmallVector<SDValue, 64> LUTVec;
22955 for (int i = 0; i < NumByteElts; ++i)
22956 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22957 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22958 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
22960 // High nibbles
22961 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22962 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
22964 // Low nibbles
22965 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
22967 // The input vector is used as the shuffle mask that indexes elements into
22968 // the LUT. After counting the low and high nibbles, add the two results to
22969 // obtain the final pop count per i8 element.
22970 SDValue HighPopCnt =
22971 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
22972 SDValue LowPopCnt =
22973 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
22974 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
22976 if (EltVT == MVT::i8)
22977 return PopCnt;
22979 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
22980 }
22982 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
22983 const X86Subtarget &Subtarget,
22984 SelectionDAG &DAG) {
22985 MVT VT = Op.getSimpleValueType();
22986 assert(VT.is128BitVector() &&
22987 "Only 128-bit vector bitmath lowering supported.");
22989 int VecSize = VT.getSizeInBits();
22990 MVT EltVT = VT.getVectorElementType();
22991 int Len = EltVT.getSizeInBits();
22993 // This is the vectorized version of the "best" algorithm from
22994 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
22995 // with a minor tweak to use a series of adds + shifts instead of vector
22996 // multiplications. Implemented for all integer vector types. We only use
22997 // this when we don't have SSSE3, which allows a LUT-based lowering that is
22998 // much faster, even faster than using native popcnt instructions.
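// For reference, the scalar form of the sequence implemented below (a
// sketch for one 32-bit element; the vector code splats the same 8-bit
// mask patterns across every element width):
//   v = v - ((v >> 1) & 0x55555555);
//   v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
//   v = (v + (v >> 4)) & 0x0F0F0F0F;  // byte-wise pop counts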
23000 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23001 MVT VT = V.getSimpleValueType();
23002 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23003 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23004 };
23005 auto GetMask = [&](SDValue V, APInt Mask) {
23006 MVT VT = V.getSimpleValueType();
23007 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23008 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23009 };
23011 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23012 // x86, so set the SRL type to have elements at least i16 wide. This is
23013 // correct because all of our SRLs are followed immediately by a mask anyway
23014 // that handles any bits that sneak into the high bits of the byte elements.
23015 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23017 SDValue V = Op;
23019 // v = v - ((v >> 1) & 0x55555555...)
23020 SDValue Srl =
23021 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23022 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23023 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23025 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23026 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23027 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23028 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23029 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23031 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23032 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23033 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23034 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23036 // At this point, V contains the byte-wise population count, and we are
23037 // merely doing a horizontal sum if necessary to get the wider element
23038 // type.
23039 if (EltVT == MVT::i8)
23040 return V;
23042 return LowerHorizontalByteSum(
23043 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23044 DAG);
23045 }
23047 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23048 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23049 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23050 SelectionDAG &DAG) {
23051 MVT VT = Op.getSimpleValueType();
23052 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23053 "Unknown CTPOP type to handle");
23054 SDLoc DL(Op.getNode());
23055 SDValue Op0 = Op.getOperand(0);
23057 if (!Subtarget.hasSSSE3()) {
23058 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23059 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23060 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23061 }
23063 // Decompose 256-bit ops into smaller 128-bit ops.
23064 if (VT.is256BitVector() && !Subtarget.hasInt256())
23065 return Lower256IntUnary(Op, DAG);
23067 // Decompose 512-bit ops into smaller 256-bit ops.
23068 if (VT.is512BitVector() && !Subtarget.hasBWI())
23069 return Lower512IntUnary(Op, DAG);
23071 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23072 }
23074 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23075 SelectionDAG &DAG) {
23076 assert(Op.getSimpleValueType().isVector() &&
23077 "We only do custom lowering for vector population count.");
23078 return LowerVectorCTPOP(Op, Subtarget, DAG);
23079 }
23081 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23082 MVT VT = Op.getSimpleValueType();
23083 SDValue In = Op.getOperand(0);
23084 SDLoc DL(Op);
23086 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23087 // perform the BITREVERSE.
23088 if (!VT.isVector()) {
23089 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23090 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23091 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23092 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23093 DAG.getIntPtrConstant(0, DL));
23094 }
23096 int NumElts = VT.getVectorNumElements();
23097 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23099 // Decompose 256-bit ops into smaller 128-bit ops.
23100 if (VT.is256BitVector())
23101 return Lower256IntUnary(Op, DAG);
23103 assert(VT.is128BitVector() &&
23104 "Only 128-bit vector bitreverse lowering supported.");
23106 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23107 // perform the BSWAP in the shuffle.
23108 // It's best to shuffle using the second operand, as this implicitly allows
23109 // memory folding for multiple vectors.
23110 SmallVector<SDValue, 16> MaskElts;
23111 for (int i = 0; i != NumElts; ++i) {
23112 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23113 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23114 int PermuteByte = SourceByte | (2 << 5);
23115 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23116 }
23117 }
23119 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23120 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23121 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23122 Res, Mask);
23123 return DAG.getBitcast(VT, Res);
23124 }
23126 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23127 SelectionDAG &DAG) {
23128 if (Subtarget.hasXOP())
23129 return LowerBITREVERSE_XOP(Op, DAG);
23131 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23133 MVT VT = Op.getSimpleValueType();
23134 SDValue In = Op.getOperand(0);
23135 SDLoc DL(Op);
23137 unsigned NumElts = VT.getVectorNumElements();
23138 assert(VT.getScalarType() == MVT::i8 &&
23139 "Only byte vector BITREVERSE supported");
23141 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23142 if (VT.is256BitVector() && !Subtarget.hasInt256())
23143 return Lower256IntUnary(Op, DAG);
23145 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23146 // two nibbles, and a PSHUFB lookup finds the bitreverse of each
23147 // 0-15 value (which lands in the other nibble).
23148 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23149 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23150 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23152 const int LoLUT[16] = {
23153 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23154 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23155 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23156 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23157 const int HiLUT[16] = {
23158 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23159 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23160 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23161 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
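// Worked example (a sketch): for the byte 0x1E (0b00011110), the low
// nibble 0xE gives LoLUT[0xe] = 0x70 and the high nibble 0x1 gives
// HiLUT[0x1] = 0x08; OR-ing them yields 0x78 (0b01111000), the
// bit-reversal of 0x1E.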
23163 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23164 for (unsigned i = 0; i < NumElts; ++i) {
23165 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23166 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23169 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23170 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23171 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23172 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23173 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23174 }
23176 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23177 unsigned NewOpc = 0;
23178 switch (N->getOpcode()) {
23179 case ISD::ATOMIC_LOAD_ADD:
23180 NewOpc = X86ISD::LADD;
23181 break;
23182 case ISD::ATOMIC_LOAD_SUB:
23183 NewOpc = X86ISD::LSUB;
23184 break;
23185 case ISD::ATOMIC_LOAD_OR:
23186 NewOpc = X86ISD::LOR;
23187 break;
23188 case ISD::ATOMIC_LOAD_XOR:
23189 NewOpc = X86ISD::LXOR;
23190 break;
23191 case ISD::ATOMIC_LOAD_AND:
23192 NewOpc = X86ISD::LAND;
23193 break;
23194 default:
23195 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23196 }
23198 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23199 return DAG.getMemIntrinsicNode(
23200 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23201 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23202 /*MemVT=*/N->getSimpleValueType(0), MMO);
23203 }
23205 /// Lower atomic_load_ops into LOCK-prefixed operations.
23206 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23207 const X86Subtarget &Subtarget) {
23208 SDValue Chain = N->getOperand(0);
23209 SDValue LHS = N->getOperand(1);
23210 SDValue RHS = N->getOperand(2);
23211 unsigned Opc = N->getOpcode();
23212 MVT VT = N->getSimpleValueType(0);
23213 SDLoc DL(N);
23215 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23216 // can only be lowered when the result is unused. They should have already
23217 // been transformed into a cmpxchg loop in AtomicExpand.
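// For example (illustrative only): with an unused result,
//   atomicrmw add i32* %p, i32 1 seq_cst
// can be selected as "lock addl $1, (%rdi)", whereas a used result needs
// the exchanging form "lock xaddl %eax, (%rdi)" (LXADD) instead.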
23218 if (N->hasAnyUseOfValue(0)) {
23219 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23220 // select LXADD if LOCK_SUB can't be selected.
23221 if (Opc == ISD::ATOMIC_LOAD_SUB) {
23222 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23223 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23224 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23225 RHS, AN->getMemOperand());
23226 }
23227 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23228 "Used AtomicRMW ops other than Add should have been expanded!");
23229 return N;
23230 }
23232 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23233 // RAUW the chain, but don't worry about the result, as it's unused.
23234 assert(!N->hasAnyUseOfValue(0));
23235 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23236 return SDValue();
23237 }
23239 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23240 SDNode *Node = Op.getNode();
23241 SDLoc dl(Node);
23242 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23244 // Convert seq_cst store -> xchg
23245 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23246 // FIXME: On 32-bit, store -> fist or movq would be more efficient
23247 // (The only way to get a 16-byte store is cmpxchg16b)
23248 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
23249 if (cast<AtomicSDNode>(Node)->getOrdering() ==
23250 AtomicOrdering::SequentiallyConsistent ||
23251 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23252 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23253 cast<AtomicSDNode>(Node)->getMemoryVT(),
23254 Node->getOperand(0),
23255 Node->getOperand(1), Node->getOperand(2),
23256 cast<AtomicSDNode>(Node)->getMemOperand());
23257 return Swap.getValue(1);
23258 }
23259 // Other atomic stores have a simple pattern.
23260 return Op;
23261 }
23263 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23264 SDNode *N = Op.getNode();
23265 MVT VT = N->getSimpleValueType(0);
23267 // Let legalize expand this if it isn't a legal type yet.
23268 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23269 return SDValue();
23271 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23273 SDLoc DL(N);
23274 // Set the carry flag: adding all-ones to the incoming carry value produces a hardware carry-out exactly when that value is nonzero.
23275 SDValue Carry = Op.getOperand(2);
23276 EVT CarryVT = Carry.getValueType();
23277 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
23278 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23279 Carry, DAG.getConstant(NegOne, DL, CarryVT));
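// E.g. (a sketch): an incoming carry of 1 computes 1 + 0xFF...FF, which
// wraps to zero and sets CF, while an incoming 0 leaves CF clear; the
// ADC/SBB below then consumes CF via Carry.getValue(1).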
23281 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
23282 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
23283 Op.getOperand(1), Carry.getValue(1));
23285 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
23286 if (N->getValueType(1) == MVT::i1)
23287 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23289 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23290 }
23292 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23293 SelectionDAG &DAG) {
23294 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23296 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23297 // which returns the values as { float, float } (in XMM0) or
23298 // { double, double } (which is returned in XMM0, XMM1).
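// Illustrative prototype of the f64 entry point (a sketch of the ABI
// described above, not a declaration from any header; the struct name is
// hypothetical):
//   struct __sincos_ret { double Sin, Cos; };
//   struct __sincos_ret __sincos_stret(double X);
// so a single call yields both results without a second libcall.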
23299 SDLoc dl(Op);
23300 SDValue Arg = Op.getOperand(0);
23301 EVT ArgVT = Arg.getValueType();
23302 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23304 TargetLowering::ArgListTy Args;
23305 TargetLowering::ArgListEntry Entry;
23307 Entry.Node = Arg;
23308 Entry.Ty = ArgTy;
23309 Entry.IsSExt = false;
23310 Entry.IsZExt = false;
23311 Args.push_back(Entry);
23313 bool isF64 = ArgVT == MVT::f64;
23314 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23315 // the small struct {f32, f32} is returned in (eax, edx). For f64,
23316 // the results are returned via SRet in memory.
23317 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
23318 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23319 SDValue Callee =
23320 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23322 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
23323 : (Type *)VectorType::get(ArgTy, 4);
23325 TargetLowering::CallLoweringInfo CLI(DAG);
23326 CLI.setDebugLoc(dl)
23327 .setChain(DAG.getEntryNode())
23328 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23330 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23332 if (isF64)
23333 // Returned in xmm0 and xmm1.
23334 return CallResult.first;
23336 // Returned in bits 0:31 and 32:63 of xmm0.
23337 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23338 CallResult.first, DAG.getIntPtrConstant(0, dl));
23339 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23340 CallResult.first, DAG.getIntPtrConstant(1, dl));
23341 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23342 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23343 }
23345 /// Widen a vector input to a vector of NVT. The
23346 /// input vector must have the same element type as NVT.
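/// For example, widening <2 x i32> <a, b> to <4 x i32> yields
/// <a, b, 0, 0> when FillWithZeroes is set and <a, b, undef, undef>
/// otherwise.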
23347 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23348 bool FillWithZeroes = false) {
23349 // Check if InOp already has the right width.
23350 MVT InVT = InOp.getSimpleValueType();
23351 if (InVT == NVT)
23352 return InOp;
23354 if (InOp.isUndef())
23355 return DAG.getUNDEF(NVT);
23357 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23358 "input and widen element type must match");
23360 unsigned InNumElts = InVT.getVectorNumElements();
23361 unsigned WidenNumElts = NVT.getVectorNumElements();
23362 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23363 "Unexpected request for vector widening");
23365 SDLoc dl(InOp);
23366 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23367 InOp.getNumOperands() == 2) {
23368 SDValue N1 = InOp.getOperand(1);
23369 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23370 N1.isUndef()) {
23371 InOp = InOp.getOperand(0);
23372 InVT = InOp.getSimpleValueType();
23373 InNumElts = InVT.getVectorNumElements();
23374 }
23375 }
23376 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23377 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23378 SmallVector<SDValue, 16> Ops;
23379 for (unsigned i = 0; i < InNumElts; ++i)
23380 Ops.push_back(InOp.getOperand(i));
23382 EVT EltVT = InOp.getOperand(0).getValueType();
23384 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23385 DAG.getUNDEF(EltVT);
23386 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23387 Ops.push_back(FillVal);
23388 return DAG.getBuildVector(NVT, dl, Ops);
23389 }
23390 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23391 DAG.getUNDEF(NVT);
23392 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23393 InOp, DAG.getIntPtrConstant(0, dl));
23394 }
23396 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23397 SelectionDAG &DAG) {
23398 assert(Subtarget.hasAVX512() &&
23399 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23401 // X86 scatter kills the mask register, so its type should be added to
23402 // the list of return values.
23403 // If the "scatter" has 2 return values, it is already handled.
23404 if (Op.getNode()->getNumValues() == 2)
23405 return Op;
23407 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23408 SDValue Src = N->getValue();
23409 MVT VT = Src.getSimpleValueType();
23410 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23412 SDLoc dl(Op);
23413 SDValue NewScatter;
23414 SDValue Index = N->getIndex();
23415 SDValue Mask = N->getMask();
23416 SDValue Chain = N->getChain();
23417 SDValue BasePtr = N->getBasePtr();
23418 MVT MemVT = N->getMemoryVT().getSimpleVT();
23419 MVT IndexVT = Index.getSimpleValueType();
23420 MVT MaskVT = Mask.getSimpleValueType();
23422 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23423 // The v2i32 value was promoted to v2i64.
23424 // Now we "redo" the type legalizer's work and widen the original
23425 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
23426 // by taking the low 32 bits of each 64-bit element (shuffle mask {0, 2}).
23427 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23428 "Unexpected memory type");
23429 int ShuffleMask[] = {0, 2, -1, -1};
23430 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23431 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
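// E.g. (a sketch): a source <a, b> promoted to v2i64 reads as
// <a, ?, b, ?> when viewed as v4i32; the {0, 2, -1, -1} shuffle above
// produces <a, b, u, u>, the widened v4i32 payload for the scatter.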
23432 // Now we have 4 elements instead of 2.
23433 // Expand the index.
23434 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23435 Index = ExtendToType(Index, NewIndexVT, DAG);
23437 // Expand the mask with zeroes
23438 // Mask may be <2 x i64> or <2 x i1> at this moment
23439 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23440 "Unexpected mask type");
23441 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23442 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23443 }
23446 unsigned NumElts = VT.getVectorNumElements();
23447 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23448 !Index.getSimpleValueType().is512BitVector()) {
23449 // AVX512F supports only 512-bit vectors; either the data or the index
23450 // must be 512 bits wide. If both the index and data are 256-bit here
23451 // but the vector contains 8 elements, we just sign-extend the index.
23452 if (IndexVT == MVT::v8i32)
23453 // Just extend index
23454 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23455 else {
23456 // The minimal number of elts in scatter is 8
23457 NumElts = 8;
23458 // Index
23459 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23460 // Use original index here, do not modify the index twice
23461 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23462 if (IndexVT.getScalarType() == MVT::i32)
23463 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23466 // At this point we have promoted mask operand
23467 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23468 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23469 // Use the original mask here, do not modify the mask twice
23470 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23472 // The value that should be stored
23473 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23474 Src = ExtendToType(Src, NewVT, DAG);
23475 }
23476 }
23477 // If the mask is "wide" at this point, truncate it to an i1 vector.
23478 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23479 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23481 // The mask is killed by scatter; add it to the result values.
23482 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23483 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23484 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23485 N->getMemOperand());
23486 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23487 return SDValue(NewScatter.getNode(), 1);
23488 }
23490 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23491 SelectionDAG &DAG) {
23493 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23494 MVT VT = Op.getSimpleValueType();
23495 MVT ScalarVT = VT.getScalarType();
23496 SDValue Mask = N->getMask();
23497 SDLoc dl(Op);
23499 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23500 "Expanding masked load is supported on AVX-512 target only!");
23502 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23503 "Expanding masked load is supported for 32 and 64-bit types only!");
23505 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless
23506 // of VLX. Expanding loads of these types are handled below.
23507 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23508 return Op;
23510 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23511 "Cannot lower masked load op.");
23513 assert((ScalarVT.getSizeInBits() >= 32 ||
23514 (Subtarget.hasBWI() &&
23515 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23516 "Unsupported masked load op.");
23518 // This operation is legal for targets with VLX, but without
23519 // VLX the vector should be widened to 512 bits.
23520 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23521 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23522 SDValue Src0 = N->getSrc0();
23523 Src0 = ExtendToType(Src0, WideDataVT, DAG);
23525 // Mask element has to be i1.
23526 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23527 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23528 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23530 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23532 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23533 if (MaskEltTy != MVT::i1)
23534 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23535 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23536 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23537 N->getBasePtr(), Mask, Src0,
23538 N->getMemoryVT(), N->getMemOperand(),
23539 N->getExtensionType(),
23540 N->isExpandingLoad());
23542 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23543 NewLoad.getValue(0),
23544 DAG.getIntPtrConstant(0, dl));
23545 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
23546 return DAG.getMergeValues(RetOps, dl);
23547 }
23549 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23550 SelectionDAG &DAG) {
23551 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23552 SDValue DataToStore = N->getValue();
23553 MVT VT = DataToStore.getSimpleValueType();
23554 MVT ScalarVT = VT.getScalarType();
23555 SDValue Mask = N->getMask();
23556 SDLoc dl(Op);
23558 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23559 "Compressing masked store is supported on AVX-512 target only!");
23561 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23562 "Compressing masked store is supported for 32 and 64-bit types only!");
23564 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
23565 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23566 return Op;
23568 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23569 "Cannot lower masked store op.");
23571 assert((ScalarVT.getSizeInBits() >= 32 ||
23572 (Subtarget.hasBWI() &&
23573 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23574 "Unsupported masked store op.");
23576 // This operation is legal for targets with VLX, but without
23577 // VLX the vector should be widened to 512 bits.
23578 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23579 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23581 // Mask element has to be i1.
23582 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23583 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23584 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23586 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23588 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23589 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23590 if (MaskEltTy != MVT::i1)
23591 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23592 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23593 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23594 Mask, N->getMemoryVT(), N->getMemOperand(),
23595 N->isTruncatingStore(), N->isCompressingStore());
23596 }
23598 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23599 SelectionDAG &DAG) {
23600 assert(Subtarget.hasAVX512() &&
23601 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23603 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23604 SDLoc dl(Op);
23605 MVT VT = Op.getSimpleValueType();
23606 SDValue Index = N->getIndex();
23607 SDValue Mask = N->getMask();
23608 SDValue Src0 = N->getValue();
23609 MVT IndexVT = Index.getSimpleValueType();
23610 MVT MaskVT = Mask.getSimpleValueType();
23612 unsigned NumElts = VT.getVectorNumElements();
23613 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23615 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23616 !Index.getSimpleValueType().is512BitVector()) {
23617 // AVX512F supports only 512-bit vectors; either the data or the index
23618 // must be 512 bits wide. If both the index and data are 256-bit here
23619 // but the vector contains 8 elements, we just sign-extend the index.
23620 if (NumElts == 8) {
23621 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23622 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23623 N->getOperand(3), Index };
23624 DAG.UpdateNodeOperands(N, Ops);
23625 return Op;
23626 }
23628 // Minimal number of elements in Gather
23629 NumElts = 8;
23630 // Index
23631 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23632 Index = ExtendToType(Index, NewIndexVT, DAG);
23633 if (IndexVT.getScalarType() == MVT::i32)
23634 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23637 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23638 // At this point we have promoted mask operand
23639 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23640 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23641 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23642 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23644 // The pass-through value
23645 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23646 Src0 = ExtendToType(Src0, NewVT, DAG);
23648 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23649 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23650 N->getMemoryVT(), dl, Ops,
23651 N->getMemOperand());
23652 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23653 NewGather.getValue(0),
23654 DAG.getIntPtrConstant(0, dl));
23655 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
23656 return DAG.getMergeValues(RetOps, dl);
23657 }
23658 return Op;
23659 }
23661 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23662 SelectionDAG &DAG) const {
23663 // TODO: Eventually, the lowering of these nodes should be informed by or
23664 // deferred to the GC strategy for the function in which they appear. For
23665 // now, however, they must be lowered to something. Since they are logically
23666 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23667 // require special handling for these nodes), lower them as literal NOOPs for
23668 // the time being.
23669 SmallVector<SDValue, 2> Ops;
23671 Ops.push_back(Op.getOperand(0));
23672 if (Op->getGluedNode())
23673 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23676 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23677 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23679 return NOOP;
23680 }
23682 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23683 SelectionDAG &DAG) const {
23684 // TODO: Eventually, the lowering of these nodes should be informed by or
23685 // deferred to the GC strategy for the function in which they appear. For
23686 // now, however, they must be lowered to something. Since they are logically
23687 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23688 // require special handling for these nodes), lower them as literal NOOPs for
23689 // the time being.
23690 SmallVector<SDValue, 2> Ops;
23692 Ops.push_back(Op.getOperand(0));
23693 if (Op->getGluedNode())
23694 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23697 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23698 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23700 return NOOP;
23701 }
23703 /// Provide custom lowering hooks for some operations.
23704 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
23705 switch (Op.getOpcode()) {
23706 default: llvm_unreachable("Should not custom lower this!");
23707 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
23708 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
23709 return LowerCMP_SWAP(Op, Subtarget, DAG);
23710 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
23711 case ISD::ATOMIC_LOAD_ADD:
23712 case ISD::ATOMIC_LOAD_SUB:
23713 case ISD::ATOMIC_LOAD_OR:
23714 case ISD::ATOMIC_LOAD_XOR:
23715 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23716 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23717 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23718 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23719 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23720 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23721 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23722 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23723 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23724 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23725 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23726 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
23727 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23728 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23729 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23730 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23731 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23732 case ISD::SHL_PARTS:
23733 case ISD::SRA_PARTS:
23734 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23735 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23736 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23737 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23738 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23739 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23740 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23741 case ISD::ZERO_EXTEND_VECTOR_INREG:
23742 case ISD::SIGN_EXTEND_VECTOR_INREG:
23743 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23744 case ISD::FP_TO_SINT:
23745 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
23746 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23747 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23748 case ISD::FABS:
23749 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23750 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23751 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23752 case ISD::SETCC: return LowerSETCC(Op, DAG);
23753 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
23754 case ISD::SELECT: return LowerSELECT(Op, DAG);
23755 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23756 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23757 case ISD::VASTART: return LowerVASTART(Op, DAG);
23758 case ISD::VAARG: return LowerVAARG(Op, DAG);
23759 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23760 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23761 case ISD::INTRINSIC_VOID:
23762 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23763 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23764 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23765 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23766 case ISD::FRAME_TO_ARGS_OFFSET:
23767 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23768 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23769 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23770 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23771 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23772 case ISD::EH_SJLJ_SETUP_DISPATCH:
23773 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23774 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23775 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23776 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23777 case ISD::CTLZ:
23778 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23779 case ISD::CTTZ:
23780 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23781 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23782 case ISD::MULHS:
23783 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23784 case ISD::UMUL_LOHI:
23785 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23786 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23787 case ISD::SRA:
23788 case ISD::SRL:
23789 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23790 case ISD::SADDO:
23791 case ISD::UADDO:
23792 case ISD::SSUBO:
23793 case ISD::USUBO:
23794 case ISD::SMULO:
23795 case ISD::UMULO: return LowerXALUO(Op, DAG);
23796 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23797 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23798 case ISD::ADDCARRY:
23799 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
23800 case ISD::ADD:
23801 case ISD::SUB: return LowerADD_SUB(Op, DAG);
23802 case ISD::SMAX:
23803 case ISD::SMIN:
23804 case ISD::UMAX:
23805 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23806 case ISD::ABS: return LowerABS(Op, DAG);
23807 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23808 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23809 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23810 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23811 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23812 case ISD::GC_TRANSITION_START:
23813 return LowerGC_TRANSITION_START(Op, DAG);
23814 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23815 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23816 }
23817 }
23819 /// Places new result values for the node in Results (their number
23820 /// and types must exactly match those of the original return values of
23821 /// the node), or leaves Results empty, which indicates that the node is not
23822 /// to be custom lowered after all.
23823 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23824 SmallVectorImpl<SDValue> &Results,
23825 SelectionDAG &DAG) const {
23826 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
23828 if (!Res.getNode())
23829 return;
23831 assert((N->getNumValues() <= Res->getNumValues()) &&
23832 "Lowering returned the wrong number of results!");
23834 // Place new result values based on the N result number.
23835 // In some cases (LowerSINT_TO_FP for example) Res has more result values
23836 // than the original node; the chain (the last value) should then be dropped.
23837 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23838 Results.push_back(Res.getValue(I));
23839 }
23841 /// Replace a node with an illegal result type with a new node built out of
23842 /// custom code.
23843 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23844 SmallVectorImpl<SDValue>&Results,
23845 SelectionDAG &DAG) const {
23846 SDLoc dl(N);
23847 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23848 switch (N->getOpcode()) {
23849 default:
23850 llvm_unreachable("Do not know how to custom type legalize this operation!");
23851 case X86ISD::AVG: {
23852 // Legalize types for X86ISD::AVG by expanding vectors.
23853 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23855 auto InVT = N->getValueType(0);
23856 auto InVTSize = InVT.getSizeInBits();
23857 const unsigned RegSize =
23858 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23859 assert((Subtarget.hasBWI() || RegSize < 512) &&
23860 "512-bit vector requires AVX512BW");
23861 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23862 "256-bit vector requires AVX2");
23864 auto ElemVT = InVT.getVectorElementType();
23865 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23866 RegSize / ElemVT.getSizeInBits());
23867 assert(RegSize % InVT.getSizeInBits() == 0);
23868 unsigned NumConcat = RegSize / InVT.getSizeInBits();
23870 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23871 Ops[0] = N->getOperand(0);
23872 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23873 Ops[0] = N->getOperand(1);
23874 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23876 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23877 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23878 DAG.getIntPtrConstant(0, dl)));
23879 return;
23880 }
23881 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
23882 case X86ISD::FMINC:
23883 case X86ISD::FMIN:
23884 case X86ISD::FMAXC:
23885 case X86ISD::FMAX: {
23886 EVT VT = N->getValueType(0);
23887 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
23888 SDValue UNDEF = DAG.getUNDEF(VT);
23889 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23890 N->getOperand(0), UNDEF);
23891 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23892 N->getOperand(1), UNDEF);
23893 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
23894 return;
23895 }
23896 case ISD::SDIV:
23897 case ISD::UDIV:
23898 case ISD::SREM:
23899 case ISD::UREM:
23900 case ISD::SDIVREM:
23901 case ISD::UDIVREM: {
23902 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23903 Results.push_back(V);
23904 return;
23905 }
23906 case ISD::FP_TO_SINT:
23907 case ISD::FP_TO_UINT: {
23908 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
23910 if (N->getValueType(0) == MVT::v2i32) {
23911 assert((IsSigned || Subtarget.hasAVX512()) &&
23912 "Can only handle signed conversion without AVX512");
23913 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23914 SDValue Src = N->getOperand(0);
23915 if (Src.getValueType() == MVT::v2f64) {
23916 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23917 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23918 : X86ISD::CVTTP2UI,
23919 dl, MVT::v4i32, Src);
23920 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23920 Results.push_back(Res);
23921 return;
23922 }
23924 if (Src.getValueType() == MVT::v2f32) {
23925 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23926 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23927 DAG.getUNDEF(MVT::v2f32));
23928 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23929 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23930 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23931 Results.push_back(Res);
23932 return;
23933 }
23935 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
23936 // so early out here.
23937 return;
23938 }
23940 std::pair<SDValue,SDValue> Vals =
23941 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
23942 SDValue FIST = Vals.first, StackSlot = Vals.second;
23943 if (FIST.getNode()) {
23944 EVT VT = N->getValueType(0);
23945 // Return a load from the stack slot.
23946 if (StackSlot.getNode())
23947 Results.push_back(
23948 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
23949 else
23950 Results.push_back(FIST);
23951 }
23952 return;
23953 }
23954 case ISD::SINT_TO_FP: {
23955 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
23956 SDValue Src = N->getOperand(0);
23957 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
23958 return;
23959 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
23960 return;
23961 }
23962 case ISD::UINT_TO_FP: {
23963 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23964 EVT VT = N->getValueType(0);
23965 if (VT != MVT::v2f32)
23966 return;
23967 SDValue Src = N->getOperand(0);
23968 EVT SrcVT = Src.getValueType();
23969 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
23970 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
23971 return;
23972 }
23973 if (SrcVT != MVT::v2i32)
23974 return;
23975 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
23976 SDValue VBias =
23977 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
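// Note on the constant above (a sketch of the classic trick): the bit
// pattern 0x4330000000000000 is the double 2^52. OR-ing a zero-extended
// 32-bit value x into the low mantissa bits produces the double 2^52 + x
// exactly, so the FSUB below recovers x as a double; e.g. x = 5 gives
// (2^52 + 5) - 2^52 == 5.0.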
23978 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
23979 DAG.getBitcast(MVT::v2i64, VBias));
23980 Or = DAG.getBitcast(MVT::v2f64, Or);
23981 // TODO: Are there any fast-math-flags to propagate here?
23982 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
23983 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
23984 return;
23985 }
23986 case ISD::FP_ROUND: {
23987 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
23988 return;
23989 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
23990 Results.push_back(V);
23991 return;
23992 }
23993 case ISD::FP_EXTEND: {
23994 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
23995 // No other ValueType for FP_EXTEND should reach this point.
23996 assert(N->getValueType(0) == MVT::v2f32 &&
23997 "Do not know how to legalize this Node");
24000 case ISD::INTRINSIC_W_CHAIN: {
24001 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24002 switch (IntNo) {
24003 default : llvm_unreachable("Do not know how to custom type "
24004 "legalize this intrinsic operation!");
24005 case Intrinsic::x86_rdtsc:
24006 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24007 Results);
24008 case Intrinsic::x86_rdtscp:
24009 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24010 Results);
24011 case Intrinsic::x86_rdpmc:
24012 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24014 case Intrinsic::x86_xgetbv:
24015 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24016 }
24017 }
24018 case ISD::INTRINSIC_WO_CHAIN: {
24019 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24020 Results.push_back(V);
24021 return;
24022 }
24023 case ISD::READCYCLECOUNTER: {
24024 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24025 Results);
24026 }
24027 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24028 EVT T = N->getValueType(0);
24029 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24030 bool Regs64bit = T == MVT::i128;
24031 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24032 SDValue cpInL, cpInH;
24033 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24034 DAG.getConstant(0, dl, HalfT));
24035 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24036 DAG.getConstant(1, dl, HalfT));
24037 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24038 Regs64bit ? X86::RAX : X86::EAX,
24039 cpInL, SDValue());
24040 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24041 Regs64bit ? X86::RDX : X86::EDX,
24042 cpInH, cpInL.getValue(1));
24043 SDValue swapInL, swapInH;
24044 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24045 DAG.getConstant(0, dl, HalfT));
24046 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24047 DAG.getConstant(1, dl, HalfT));
24048 swapInH =
24049 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24050 swapInH, cpInH.getValue(1));
24051 // If the current function needs the base pointer, RBX,
24052 // we shouldn't use cmpxchg directly. Indeed, the lowering of that
24053 // instruction will clobber that register, and since RBX will be
24054 // a reserved register, the register allocator will not make sure
24055 // its value will be properly saved and restored
24056 // around this live-range.
24057 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24058 SDValue Result;
24059 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24060 unsigned BasePtr = TRI->getBaseRegister();
24061 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24062 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24063 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24064 // ISel prefers the LCMPXCHG64 variant.
24065 // If that assert breaks, that means it is not the case anymore,
24066 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24067 // not just EBX. This is a matter of accepting i64 input for that
24068 // pseudo, and restoring into the register of the right width
24069 // in the expand pseudo. Everything else should just work.
24070 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24071 "Saving only half of the RBX");
24072 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24073 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24074 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24075 Regs64bit ? X86::RBX : X86::EBX,
24076 HalfT, swapInH.getValue(1));
24077 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24078 RBXSave,
24079 /*Glue*/ RBXSave.getValue(2)};
24080 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24081 } else {
24082 unsigned Opcode =
24083 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24084 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24085 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24086 swapInH.getValue(1));
24087 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24088 swapInL.getValue(1)};
24089 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24090 }
24091 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24092 Regs64bit ? X86::RAX : X86::EAX,
24093 HalfT, Result.getValue(1));
24094 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24095 Regs64bit ? X86::RDX : X86::EDX,
24096 HalfT, cpOutL.getValue(2));
24097 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24099 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24100 MVT::i32, cpOutH.getValue(2));
24101 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24102 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24104 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24105 Results.push_back(Success);
24106 Results.push_back(EFLAGS.getValue(1));
24107 return;
24108 }
24109 case ISD::ATOMIC_SWAP:
24110 case ISD::ATOMIC_LOAD_ADD:
24111 case ISD::ATOMIC_LOAD_SUB:
24112 case ISD::ATOMIC_LOAD_AND:
24113 case ISD::ATOMIC_LOAD_OR:
24114 case ISD::ATOMIC_LOAD_XOR:
24115 case ISD::ATOMIC_LOAD_NAND:
24116 case ISD::ATOMIC_LOAD_MIN:
24117 case ISD::ATOMIC_LOAD_MAX:
24118 case ISD::ATOMIC_LOAD_UMIN:
24119 case ISD::ATOMIC_LOAD_UMAX:
24120 case ISD::ATOMIC_LOAD: {
24121 // Delegate to generic TypeLegalization. Situations we can really handle
24122 // should have already been dealt with by AtomicExpandPass.cpp.
24123 break;
24124 }
24125 case ISD::BITCAST: {
24126 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24127 EVT DstVT = N->getValueType(0);
24128 EVT SrcVT = N->getOperand(0)->getValueType(0);
24130 if (SrcVT != MVT::f64 ||
24131 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24132 return;
24134 unsigned NumElts = DstVT.getVectorNumElements();
24135 EVT SVT = DstVT.getVectorElementType();
24136 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24137 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24138 MVT::v2f64, N->getOperand(0));
24139 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24141 if (ExperimentalVectorWideningLegalization) {
24142 // If we are legalizing vectors by widening, we already have the desired
24143 // legal vector type, just return it.
24144 Results.push_back(ToVecInt);
24145 return;
24146 }
24148 SmallVector<SDValue, 8> Elts;
24149 for (unsigned i = 0, e = NumElts; i != e; ++i)
24150 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24151 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24153 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24154 }
24155 }
24156 }
24158 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24159 switch ((X86ISD::NodeType)Opcode) {
24160 case X86ISD::FIRST_NUMBER: break;
24161 case X86ISD::BSF: return "X86ISD::BSF";
24162 case X86ISD::BSR: return "X86ISD::BSR";
24163 case X86ISD::SHLD: return "X86ISD::SHLD";
24164 case X86ISD::SHRD: return "X86ISD::SHRD";
24165 case X86ISD::FAND: return "X86ISD::FAND";
24166 case X86ISD::FANDN: return "X86ISD::FANDN";
24167 case X86ISD::FOR: return "X86ISD::FOR";
24168 case X86ISD::FXOR: return "X86ISD::FXOR";
24169 case X86ISD::FILD: return "X86ISD::FILD";
24170 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24171 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24172 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24173 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24174 case X86ISD::FLD: return "X86ISD::FLD";
24175 case X86ISD::FST: return "X86ISD::FST";
24176 case X86ISD::CALL: return "X86ISD::CALL";
24177 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24178 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24179 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24180 case X86ISD::BT: return "X86ISD::BT";
24181 case X86ISD::CMP: return "X86ISD::CMP";
24182 case X86ISD::COMI: return "X86ISD::COMI";
24183 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24184 case X86ISD::CMPM: return "X86ISD::CMPM";
24185 case X86ISD::CMPMU: return "X86ISD::CMPMU";
24186 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
24187 case X86ISD::SETCC: return "X86ISD::SETCC";
24188 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
24189 case X86ISD::FSETCC: return "X86ISD::FSETCC";
24190 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
24191 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
24192 case X86ISD::CMOV: return "X86ISD::CMOV";
24193 case X86ISD::BRCOND: return "X86ISD::BRCOND";
24194 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
24195 case X86ISD::IRET: return "X86ISD::IRET";
24196 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
24197 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
24198 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
24199 case X86ISD::Wrapper: return "X86ISD::Wrapper";
24200 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
24201 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
24202 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
24203 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
24204 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
24205 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
24206 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
24207 case X86ISD::PINSRB: return "X86ISD::PINSRB";
24208 case X86ISD::PINSRW: return "X86ISD::PINSRW";
24209 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
24210 case X86ISD::ANDNP: return "X86ISD::ANDNP";
24211 case X86ISD::BLENDI: return "X86ISD::BLENDI";
24212 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
24213 case X86ISD::ADDUS: return "X86ISD::ADDUS";
24214 case X86ISD::SUBUS: return "X86ISD::SUBUS";
24215 case X86ISD::HADD: return "X86ISD::HADD";
24216 case X86ISD::HSUB: return "X86ISD::HSUB";
24217 case X86ISD::FHADD: return "X86ISD::FHADD";
24218 case X86ISD::FHSUB: return "X86ISD::FHSUB";
24219 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
24220 case X86ISD::FMAX: return "X86ISD::FMAX";
24221 case X86ISD::FMAXS: return "X86ISD::FMAXS";
24222 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
24223 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
24224 case X86ISD::FMIN: return "X86ISD::FMIN";
24225 case X86ISD::FMINS: return "X86ISD::FMINS";
24226 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
24227 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
24228 case X86ISD::FMAXC: return "X86ISD::FMAXC";
24229 case X86ISD::FMINC: return "X86ISD::FMINC";
24230 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24231 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
24232 case X86ISD::FRCP: return "X86ISD::FRCP";
24233 case X86ISD::FRCPS: return "X86ISD::FRCPS";
24234 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
24235 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
24236 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
24237 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
24238 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
24239 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
24240 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
24241 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24242 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24243 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
24244 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
24245 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
24246 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
24247 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
24248 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
24249 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
24250 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24251 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24252 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24253 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24254 case X86ISD::LADD: return "X86ISD::LADD";
24255 case X86ISD::LSUB: return "X86ISD::LSUB";
24256 case X86ISD::LOR: return "X86ISD::LOR";
24257 case X86ISD::LXOR: return "X86ISD::LXOR";
24258 case X86ISD::LAND: return "X86ISD::LAND";
24259 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
24260 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
24261 case X86ISD::VZEXT: return "X86ISD::VZEXT";
24262 case X86ISD::VSEXT: return "X86ISD::VSEXT";
24263 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
24264 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
24265 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
24266 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
24267 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
24268 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
24269 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
24270 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
24271 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
24272 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
24273 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
24274 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
24275 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
24276 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
24277 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
24278 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
24279 case X86ISD::VSHL: return "X86ISD::VSHL";
24280 case X86ISD::VSRL: return "X86ISD::VSRL";
24281 case X86ISD::VSRA: return "X86ISD::VSRA";
24282 case X86ISD::VSHLI: return "X86ISD::VSHLI";
24283 case X86ISD::VSRLI: return "X86ISD::VSRLI";
24284 case X86ISD::VSRAI: return "X86ISD::VSRAI";
24285 case X86ISD::VSRAV: return "X86ISD::VSRAV";
24286 case X86ISD::VROTLI: return "X86ISD::VROTLI";
24287 case X86ISD::VROTRI: return "X86ISD::VROTRI";
24288 case X86ISD::VPPERM: return "X86ISD::VPPERM";
24289 case X86ISD::CMPP: return "X86ISD::CMPP";
24290 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
24291 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
24292 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
24293 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
24294 case X86ISD::ADD: return "X86ISD::ADD";
24295 case X86ISD::SUB: return "X86ISD::SUB";
24296 case X86ISD::ADC: return "X86ISD::ADC";
24297 case X86ISD::SBB: return "X86ISD::SBB";
24298 case X86ISD::SMUL: return "X86ISD::SMUL";
24299 case X86ISD::UMUL: return "X86ISD::UMUL";
24300 case X86ISD::SMUL8: return "X86ISD::SMUL8";
24301 case X86ISD::UMUL8: return "X86ISD::UMUL8";
24302 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24303 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24304 case X86ISD::INC: return "X86ISD::INC";
24305 case X86ISD::DEC: return "X86ISD::DEC";
24306 case X86ISD::OR: return "X86ISD::OR";
24307 case X86ISD::XOR: return "X86ISD::XOR";
24308 case X86ISD::AND: return "X86ISD::AND";
24309 case X86ISD::BEXTR: return "X86ISD::BEXTR";
24310 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
24311 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
24312 case X86ISD::PTEST: return "X86ISD::PTEST";
24313 case X86ISD::TESTP: return "X86ISD::TESTP";
24314 case X86ISD::TESTM: return "X86ISD::TESTM";
24315 case X86ISD::TESTNM: return "X86ISD::TESTNM";
24316 case X86ISD::KORTEST: return "X86ISD::KORTEST";
24317 case X86ISD::KTEST: return "X86ISD::KTEST";
24318 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
24319 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
24320 case X86ISD::PACKSS: return "X86ISD::PACKSS";
24321 case X86ISD::PACKUS: return "X86ISD::PACKUS";
24322 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
24323 case X86ISD::VALIGN: return "X86ISD::VALIGN";
24324 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
24325 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
24326 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
24327 case X86ISD::SHUFP: return "X86ISD::SHUFP";
24328 case X86ISD::SHUF128: return "X86ISD::SHUF128";
24329 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
24330 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
24331 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
24332 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
24333 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
24334 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
24335 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
24336 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
24337 case X86ISD::MOVSD: return "X86ISD::MOVSD";
24338 case X86ISD::MOVSS: return "X86ISD::MOVSS";
24339 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
24340 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
24341 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
24342 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
24343 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
24344 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
24345 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
24346 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
24347 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
24348 case X86ISD::VPERMV: return "X86ISD::VPERMV";
24349 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
24350 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
24351 case X86ISD::VPERMI: return "X86ISD::VPERMI";
24352 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
24353 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
24354 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
24355 case X86ISD::VRANGE: return "X86ISD::VRANGE";
24356 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
24357 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
24358 case X86ISD::PSADBW: return "X86ISD::PSADBW";
24359 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
24360 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24361 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
24362 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
24363 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
24364 case X86ISD::MFENCE: return "X86ISD::MFENCE";
24365 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
24366 case X86ISD::SAHF: return "X86ISD::SAHF";
24367 case X86ISD::RDRAND: return "X86ISD::RDRAND";
24368 case X86ISD::RDSEED: return "X86ISD::RDSEED";
24369 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
24370 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
24371 case X86ISD::VPROT: return "X86ISD::VPROT";
24372 case X86ISD::VPROTI: return "X86ISD::VPROTI";
24373 case X86ISD::VPSHA: return "X86ISD::VPSHA";
24374 case X86ISD::VPSHL: return "X86ISD::VPSHL";
24375 case X86ISD::VPCOM: return "X86ISD::VPCOM";
24376 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
24377 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
24378 case X86ISD::FMADD: return "X86ISD::FMADD";
24379 case X86ISD::FMSUB: return "X86ISD::FMSUB";
24380 case X86ISD::FNMADD: return "X86ISD::FNMADD";
24381 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
24382 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
24383 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
24384 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
24385 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
24386 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
24387 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
24388 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
24389 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
24390 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
24391 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
24392 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
24393 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
24394 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
24395 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
24396 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
24397 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
24398 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
24399 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
24400 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
24401 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
24402 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
24403 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
24404 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
24405 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
24406 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
24407 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
24408 case X86ISD::XTEST: return "X86ISD::XTEST";
24409 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
24410 case X86ISD::EXPAND: return "X86ISD::EXPAND";
24411 case X86ISD::SELECT: return "X86ISD::SELECT";
24412 case X86ISD::SELECTS: return "X86ISD::SELECTS";
24413 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
24414 case X86ISD::RCP28: return "X86ISD::RCP28";
24415 case X86ISD::RCP28S: return "X86ISD::RCP28S";
24416 case X86ISD::EXP2: return "X86ISD::EXP2";
24417 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
24418 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
24419 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
24420 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
24421 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
24422 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
24423 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
24424 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
24425 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
24426 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
24427 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
24428 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
24429 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
24430 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
24431 case X86ISD::SCALEF: return "X86ISD::SCALEF";
24432 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
24433 case X86ISD::ADDS: return "X86ISD::ADDS";
24434 case X86ISD::SUBS: return "X86ISD::SUBS";
24435 case X86ISD::AVG: return "X86ISD::AVG";
24436 case X86ISD::MULHRS: return "X86ISD::MULHRS";
24437 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
24438 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
24439 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
24440 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
24441 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
24442 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
24443 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
24444 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
24445 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
24446 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
24447 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
24448 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
24449 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
24450 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24451 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24452 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
24453 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
24454 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
24455 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
24456 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
24457 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
24458 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
24459 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
24460 case X86ISD::LWPINS: return "X86ISD::LWPINS";
24461 }
24462 return nullptr;
24463 }
24465 /// Return true if the addressing mode represented by AM is legal for this
24466 /// target, for a load/store of the specified type.
24467 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24468 const AddrMode &AM, Type *Ty,
24469 unsigned AS) const {
24470 // X86 supports extremely general addressing modes.
24471 CodeModel::Model M = getTargetMachine().getCodeModel();
24473 // X86 allows a sign-extended 32-bit immediate field as a displacement.
24474 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24475 return false;
24477 if (AM.BaseGV) {
24478 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24480 // If a reference to this global requires an extra load, we can't fold it.
24481 if (isGlobalStubReference(GVFlags))
24482 return false;
24484 // If BaseGV requires a register for the PIC base, we cannot also have a
24485 // BaseReg specified.
24486 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24487 return false;
24489 // If lower 4G is not available, then we must use rip-relative addressing.
24490 if ((M != CodeModel::Small || isPositionIndependent()) &&
24491 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24492 return false;
24493 }
24495 switch (AM.Scale) {
24496 case 0:
24497 case 1:
24498 case 2:
24499 case 4:
24500 case 8:
24501 // These scales always work.
24502 break;
24503 case 3:
24504 case 5:
24505 case 9:
24506 // These scales are formed with basereg+scalereg. Only accept if there is
24507 // no basereg yet.
24508 if (AM.HasBaseReg)
24509 return false;
24510 break;
24511 default: // Other stuff never works.
24512 return false;
24513 }
24515 return true;
24516 }
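// For example, a mode like "16(%rdi,%rcx,4)" (base + 4*index + disp) falls
// in the "always works" scales above, while Scale == 3 is only accepted
// with no base register present, because index*3 has to be matched as
// index + index*2, and the index then occupies the base slot of the SIB
// encoding.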
24518 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24519 unsigned Bits = Ty->getScalarSizeInBits();
24521 // 8-bit shifts are always expensive, and versions with a scalar amount
24522 // aren't particularly cheaper than those without.
24523 if (Bits == 8)
24524 return false;
24526 // On AVX2 there are new vpsllv[dq] instructions (and other shifts) that make
24527 // variable shifts just as cheap as scalar ones.
24528 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24529 return false;
24531 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24532 // fully general vector.
24533 return true;
24534 }
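// For example, with AVX2 "vpsllvd %ymm1, %ymm0, %ymm0" already shifts every
// lane by its own amount, so a splatted scalar amount buys nothing there; on
// plain SSE2 a v4i32 shift by a scalar is a single "pslld %xmm1, %xmm0",
// whereas a fully variable v4i32 shift has to be scalarized.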
24536 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24537 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24538 return false;
24539 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24540 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24541 return NumBits1 > NumBits2;
24542 }
24544 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24545 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24546 return false;
24548 if (!isTypeLegal(EVT::getEVT(Ty1)))
24549 return false;
24551 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24553 // Assuming the caller doesn't have a zeroext or signext return parameter,
24554 // truncation all the way down to i1 is valid.
24555 return true;
24556 }
24558 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24559 return isInt<32>(Imm);
24560 }
24562 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24563 // Can also use sub to handle negated immediates.
24564 return isInt<32>(Imm);
24565 }
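// For example, "addq $0x7fffffff, %rax" encodes its immediate directly,
// while adding 0x100000000 would first need a "movabsq" into a scratch
// register, so only sign-extended 32-bit immediates are accepted here.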
24567 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24568 if (!VT1.isInteger() || !VT2.isInteger())
24569 return false;
24570 unsigned NumBits1 = VT1.getSizeInBits();
24571 unsigned NumBits2 = VT2.getSizeInBits();
24572 return NumBits1 > NumBits2;
24573 }
24575 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24576 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24577 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24578 }
24580 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24581 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24582 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24583 }
24585 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24586 EVT VT1 = Val.getValueType();
24587 if (isZExtFree(VT1, VT2))
24588 return true;
24590 if (Val.getOpcode() != ISD::LOAD)
24591 return false;
24593 if (!VT1.isSimple() || !VT1.isInteger() ||
24594 !VT2.isSimple() || !VT2.isInteger())
24595 return false;
24597 switch (VT1.getSimpleVT().SimpleTy) {
24598 default: break;
24599 case MVT::i8:
24600 case MVT::i16:
24601 case MVT::i32:
24602 // X86 has 8, 16, and 32-bit zero-extending loads.
24603 return true;
24604 }
24606 return false;
24607 }
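// For example, a zero-extending i8 load is the single instruction
// "movzbl (%rdi), %eax"; the extension comes for free with the load.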
24609 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24611 bool
24612 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24613 if (!Subtarget.hasAnyFMA())
24614 return false;
24616 VT = VT.getScalarType();
24618 if (!VT.isSimple())
24619 return false;
24621 switch (VT.getSimpleVT().SimpleTy) {
24622 case MVT::f32:
24623 case MVT::f64:
24624 return true;
24625 default:
24626 break;
24627 }
24629 return false;
24630 }
24632 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24633 // i16 instructions are longer (0x66 prefix) and potentially slower.
24634 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24635 }
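// For example, "addw $1, %ax" carries the 0x66 operand-size prefix (and the
// 16-bit immediate form can hit a length-changing-prefix stall on some
// cores), while "addl $1, %eax" does not.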
24637 /// Targets can use this to indicate that they only support *some*
24638 /// VECTOR_SHUFFLE operations, those with specific masks.
24639 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24640 /// are assumed to be legal.
24641 bool
24642 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24643 EVT VT) const {
24644 if (!VT.isSimple())
24645 return false;
24647 // Not for i1 vectors
24648 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24649 return false;
24651 // Very little shuffling can be done for 64-bit vectors right now.
24652 if (VT.getSimpleVT().getSizeInBits() == 64)
24653 return false;
24655 // We only care that the types being shuffled are legal. The lowering can
24656 // handle any possible shuffle mask that results.
24657 return isTypeLegal(VT.getSimpleVT());
24658 }
24660 bool
24661 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24662 EVT VT) const {
24663 // Just delegate to the generic legality; clear masks aren't special.
24664 return isShuffleMaskLegal(Mask, VT);
24665 }
24667 //===----------------------------------------------------------------------===//
24668 // X86 Scheduler Hooks
24669 //===----------------------------------------------------------------------===//
24671 /// Utility function to emit xbegin specifying the start of an RTM region.
24672 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24673 const TargetInstrInfo *TII) {
24674 DebugLoc DL = MI.getDebugLoc();
24676 const BasicBlock *BB = MBB->getBasicBlock();
24677 MachineFunction::iterator I = ++MBB->getIterator();
24679 // For the v = xbegin(), we generate
24680 //
24681 // thisMBB:
24682 //  xbegin fallMBB
24683 //
24684 // mainMBB:
24685 //  s0 = -1
24686 //
24687 // fallMBB:
24688 //  eax = # XABORT_DEF
24689 //  s1 = eax
24690 //
24691 // sinkMBB:
24692 //  v = phi(s0/mainBB, s1/fallBB)
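// Note that XBEGIN itself only writes EAX on an abort; the -1 result for a
// successfully started transaction is materialized explicitly in mainMBB,
// while fallMBB reads the hardware-provided abort status out of EAX.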
24694 MachineBasicBlock *thisMBB = MBB;
24695 MachineFunction *MF = MBB->getParent();
24696 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
24697 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
24698 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
24699 MF->insert(I, mainMBB);
24700 MF->insert(I, fallMBB);
24701 MF->insert(I, sinkMBB);
24703 // Transfer the remainder of BB and its successor edges to sinkMBB.
24704 sinkMBB->splice(sinkMBB->begin(), MBB,
24705 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24706 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
24708 MachineRegisterInfo &MRI = MF->getRegInfo();
24709 unsigned DstReg = MI.getOperand(0).getReg();
24710 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
24711 unsigned mainDstReg = MRI.createVirtualRegister(RC);
24712 unsigned fallDstReg = MRI.createVirtualRegister(RC);
24714 // thisMBB:
24715 //  xbegin fallMBB
24716 //  # fallthrough to mainMBB
24717 //  # abort jumps to fallMBB
24718 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
24719 thisMBB->addSuccessor(mainMBB);
24720 thisMBB->addSuccessor(fallMBB);
24722 // mainMBB:
24723 //  mainDstReg := -1
24724 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
24725 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
24726 mainMBB->addSuccessor(sinkMBB);
24728 // fallMBB:
24729 //  ; pseudo instruction to model hardware's definition from XABORT
24730 // EAX := XABORT_DEF
24731 // fallDstReg := EAX
24732 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
24733 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
24734 .addReg(X86::EAX);
24735 fallMBB->addSuccessor(sinkMBB);
24737 // sinkMBB:
24738 //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
24739 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
24740 .addReg(mainDstReg).addMBB(mainMBB)
24741 .addReg(fallDstReg).addMBB(fallMBB);
24743 MI.eraseFromParent();
24745 return sinkMBB;
24746 }
24747 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24748 // or XMM0_V32I8 in AVX all of this code can be replaced with that
24749 // in the .td file.
24750 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24751 const TargetInstrInfo *TII) {
24752 unsigned Opc;
24753 switch (MI.getOpcode()) {
24754 default: llvm_unreachable("illegal opcode!");
24755 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24756 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24757 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24758 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24759 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24760 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24761 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24762 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24763 }
24765 DebugLoc dl = MI.getDebugLoc();
24766 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24768 unsigned NumArgs = MI.getNumOperands();
24769 for (unsigned i = 1; i < NumArgs; ++i) {
24770 MachineOperand &Op = MI.getOperand(i);
24771 if (!(Op.isReg() && Op.isImplicit()))
24772 MIB.add(Op);
24773 }
24774 if (MI.hasOneMemOperand())
24775 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24777 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24778 .addReg(X86::XMM0);
24780 MI.eraseFromParent();
24781 return BB;
24782 }
24784 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24785 // defs in an instruction pattern
24786 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24787 const TargetInstrInfo *TII) {
24788 unsigned Opc;
24789 switch (MI.getOpcode()) {
24790 default: llvm_unreachable("illegal opcode!");
24791 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24792 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24793 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24794 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24795 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24796 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24797 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24798 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24799 }
24801 DebugLoc dl = MI.getDebugLoc();
24802 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24804 unsigned NumArgs = MI.getNumOperands(); // remove the results
24805 for (unsigned i = 1; i < NumArgs; ++i) {
24806 MachineOperand &Op = MI.getOperand(i);
24807 if (!(Op.isReg() && Op.isImplicit()))
24808 MIB.add(Op);
24809 }
24810 if (MI.hasOneMemOperand())
24811 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24813 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24814 .addReg(X86::ECX);
24816 MI.eraseFromParent();
24817 return BB;
24818 }
24820 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24821 const X86Subtarget &Subtarget) {
24822 DebugLoc dl = MI.getDebugLoc();
24823 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24825 // insert input VAL into EAX
24826 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24827 .addReg(MI.getOperand(0).getReg());
24828 // insert zero to ECX
24829 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24831 // insert zero to EDX
24832 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24834 // insert WRPKRU instruction
24835 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
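// The sequence built above is effectively:
//   movl %val, %eax
//   xorl %ecx, %ecx
//   xorl %edx, %edx
//   wrpkru
// since WRPKRU takes the new PKRU value in EAX and requires ECX and EDX to
// be zero (MOV32r0 expands to the xor zeroing idiom).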
24837 MI.eraseFromParent(); // The pseudo is gone now.
24838 return BB;
24839 }
24841 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24842 const X86Subtarget &Subtarget) {
24843 DebugLoc dl = MI.getDebugLoc();
24844 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24846 // insert zero to ECX
24847 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24849 // insert RDPKRU instruction
24850 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24851 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24852 .addReg(X86::EAX);
24854 MI.eraseFromParent(); // The pseudo is gone now.
24855 return BB;
24856 }
24858 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24859 const X86Subtarget &Subtarget,
24860 unsigned Opc) {
24861 DebugLoc dl = MI.getDebugLoc();
24862 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24863 // Address into RAX/EAX, other two args into ECX, EDX.
24864 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24865 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24866 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24867 for (int i = 0; i < X86::AddrNumOperands; ++i)
24868 MIB.add(MI.getOperand(i));
24870 unsigned ValOps = X86::AddrNumOperands;
24871 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24872 .addReg(MI.getOperand(ValOps).getReg());
24873 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24874 .addReg(MI.getOperand(ValOps + 1).getReg());
24876 // The instruction doesn't actually take any operands though.
24877 BuildMI(*BB, MI, dl, TII->get(Opc));
24879 MI.eraseFromParent(); // The pseudo is gone now.
24880 return BB;
24881 }
24883 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
24884 const X86Subtarget &Subtarget) {
24885 DebugLoc dl = MI->getDebugLoc();
24886 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24887 // Address into RAX/EAX
24888 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24889 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24890 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24891 for (int i = 0; i < X86::AddrNumOperands; ++i)
24892 MIB.add(MI->getOperand(i));
24894 // The instruction doesn't actually take any operands though.
24895 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
24897 MI->eraseFromParent(); // The pseudo is gone now.
24899 return BB;
24900 }
24903 MachineBasicBlock *
24904 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24905 MachineBasicBlock *MBB) const {
24906 // Emit va_arg instruction on X86-64.
24908 // Operands to this pseudo-instruction:
24909 // 0 ) Output : destination address (reg)
24910 // 1-5) Input : va_list address (addr, i64mem)
24911 // 6 ) ArgSize : Size (in bytes) of vararg type
24912 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24913 // 8 ) Align : Alignment of type
24914 // 9 ) EFLAGS (implicit-def)
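// For example, a C "va_arg(ap, double)" under the SysV AMD64 ABI would be
// emitted with ArgSize = 8, ArgMode = 2 (use fp_offset) and Align = 8,
// while an aggregate passed in memory would use ArgMode = 0.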
24916 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24917 static_assert(X86::AddrNumOperands == 5,
24918 "VAARG_64 assumes 5 address operands");
24920 unsigned DestReg = MI.getOperand(0).getReg();
24921 MachineOperand &Base = MI.getOperand(1);
24922 MachineOperand &Scale = MI.getOperand(2);
24923 MachineOperand &Index = MI.getOperand(3);
24924 MachineOperand &Disp = MI.getOperand(4);
24925 MachineOperand &Segment = MI.getOperand(5);
24926 unsigned ArgSize = MI.getOperand(6).getImm();
24927 unsigned ArgMode = MI.getOperand(7).getImm();
24928 unsigned Align = MI.getOperand(8).getImm();
24930 // Memory Reference
24931 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24932 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24933 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24935 // Machine Information
24936 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24937 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24938 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24939 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24940 DebugLoc DL = MI.getDebugLoc();
24942 // struct va_list {
24943 //   i32   gp_offset
24944 //   i32   fp_offset
24945 //   i64   overflow_area (address)
24946 //   i64   reg_save_area (address)
24947 // }
24948 // sizeof(va_list) = 24
24949 // alignment(va_list) = 8
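// gp_offset lives at byte 0 of the va_list and fp_offset at byte 4, which
// is why the accesses below use addDisp(Disp, UseFPOffset ? 4 : 0);
// overflow_area sits at byte 8 and reg_save_area at byte 16.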
24951 unsigned TotalNumIntRegs = 6;
24952 unsigned TotalNumXMMRegs = 8;
24953 bool UseGPOffset = (ArgMode == 1);
24954 bool UseFPOffset = (ArgMode == 2);
24955 unsigned MaxOffset = TotalNumIntRegs * 8 +
24956 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
24958 /* Align ArgSize to a multiple of 8 */
24959 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
24960 bool NeedsAlign = (Align > 8);
24962 MachineBasicBlock *thisMBB = MBB;
24963 MachineBasicBlock *overflowMBB;
24964 MachineBasicBlock *offsetMBB;
24965 MachineBasicBlock *endMBB;
24967 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
24968 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
24969 unsigned OffsetReg = 0;
24971 if (!UseGPOffset && !UseFPOffset) {
24972 // If we only pull from the overflow region, we don't create a branch.
24973 // We don't need to alter control flow.
24974 OffsetDestReg = 0; // unused
24975 OverflowDestReg = DestReg;
24977 offsetMBB = nullptr;
24978 overflowMBB = thisMBB;
24981 // First emit code to check if gp_offset (or fp_offset) is below the bound.
24982 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
24983 // If not, pull from overflow_area. (branch to overflowMBB)
24988 // offsetMBB overflowMBB
24993 // Registers for the PHI in endMBB
24994 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
24995 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
24997 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24998 MachineFunction *MF = MBB->getParent();
24999 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25000 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25001 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25003 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25005 // Insert the new basic blocks
25006 MF->insert(MBBIter, offsetMBB);
25007 MF->insert(MBBIter, overflowMBB);
25008 MF->insert(MBBIter, endMBB);
25010 // Transfer the remainder of MBB and its successor edges to endMBB.
25011 endMBB->splice(endMBB->begin(), thisMBB,
25012 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25013 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25015 // Make offsetMBB and overflowMBB successors of thisMBB
25016 thisMBB->addSuccessor(offsetMBB);
25017 thisMBB->addSuccessor(overflowMBB);
25019 // endMBB is a successor of both offsetMBB and overflowMBB
25020 offsetMBB->addSuccessor(endMBB);
25021 overflowMBB->addSuccessor(endMBB);
25023 // Load the offset value into a register
25024 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25025 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25026 .add(Base)
25027 .add(Scale)
25028 .add(Index)
25029 .addDisp(Disp, UseFPOffset ? 4 : 0)
25030 .add(Segment)
25031 .setMemRefs(MMOBegin, MMOEnd);
25033 // Check if there is enough room left to pull this argument.
25034 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25035 .addReg(OffsetReg)
25036 .addImm(MaxOffset + 8 - ArgSizeA8);
25038 // Branch to "overflowMBB" if offset >= max
25039 // Fall through to "offsetMBB" otherwise
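// For a GP argument (ArgSizeA8 == 8) this takes the overflow path once
// gp_offset reaches 48, i.e. when all six integer registers (6 * 8 bytes)
// are used; for an FP argument, once fp_offset reaches 176 (48 + 8 * 16).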
25040 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25041 .addMBB(overflowMBB);
25044 // In offsetMBB, emit code to use the reg_save_area.
25045 if (offsetMBB) {
25046 assert(OffsetReg != 0);
25048 // Read the reg_save_area address.
25049 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25050 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25051 .add(Base)
25052 .add(Scale)
25053 .add(Index)
25054 .addDisp(Disp, 16)
25055 .add(Segment)
25056 .setMemRefs(MMOBegin, MMOEnd);
25058 // Zero-extend the offset
25059 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25060 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25061 .addImm(0)
25062 .addReg(OffsetReg)
25063 .addImm(X86::sub_32bit);
25065 // Add the offset to the reg_save_area to get the final address.
25066 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25067 .addReg(OffsetReg64)
25068 .addReg(RegSaveReg);
25070 // Compute the offset for the next argument
25071 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25072 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25073 .addReg(OffsetReg)
25074 .addImm(UseFPOffset ? 16 : 8);
25076 // Store it back into the va_list.
25077 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25078 .add(Base)
25079 .add(Scale)
25080 .add(Index)
25081 .addDisp(Disp, UseFPOffset ? 4 : 0)
25082 .add(Segment)
25083 .addReg(NextOffsetReg)
25084 .setMemRefs(MMOBegin, MMOEnd);
25087 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25088 .addMBB(endMBB);
25089 }
25092 // Emit code to use overflow area
25095 // Load the overflow_area address into a register.
25096 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25097 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25103 .setMemRefs(MMOBegin, MMOEnd);
25105 // If we need to align it, do so. Otherwise, just copy the address
25106 // to OverflowDestReg.
25107 if (NeedsAlign) {
25108 // Align the overflow address
25109 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25110 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25112 // aligned_addr = (addr + (align-1)) & ~(align-1)
25113 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25114 .addReg(OverflowAddrReg)
25115 .addImm(Align-1);
25117 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25118 .addReg(TmpReg)
25119 .addImm(~(uint64_t)(Align-1));
25120 } else {
25121 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25122 .addReg(OverflowAddrReg);
25123 }
25125 // Compute the next overflow address after this argument.
25126 // (the overflow address should be kept 8-byte aligned)
25127 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25128 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25129 .addReg(OverflowDestReg)
25130 .addImm(ArgSizeA8);
25132 // Store the new overflow address.
25133 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25134 .add(Base)
25135 .add(Scale)
25136 .add(Index)
25137 .addDisp(Disp, 8)
25138 .add(Segment)
25139 .addReg(NextAddrReg)
25140 .setMemRefs(MMOBegin, MMOEnd);
25142 // If we branched, emit the PHI to the front of endMBB.
25143 if (offsetMBB) {
25144 BuildMI(*endMBB, endMBB->begin(), DL,
25145 TII->get(X86::PHI), DestReg)
25146 .addReg(OffsetDestReg).addMBB(offsetMBB)
25147 .addReg(OverflowDestReg).addMBB(overflowMBB);
25148 }
25150 // Erase the pseudo instruction
25151 MI.eraseFromParent();
25153 return endMBB;
25154 }
25156 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25157 MachineInstr &MI, MachineBasicBlock *MBB) const {
25158 // Emit code to save XMM registers to the stack. The ABI says that the
25159 // number of registers to save is given in %al, so it's theoretically
25160 // possible to do an indirect jump trick to avoid saving all of them;
25161 // however, this code takes a simpler approach and just executes all
25162 // of the stores if %al is non-zero. It's less code, and it's probably
25163 // easier on the hardware branch predictor, and stores aren't all that
25164 // expensive anyway.
25166 // Create the new basic blocks. One block contains all the XMM stores,
25167 // and one block is the final destination regardless of whether any
25168 // stores were performed.
25169 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25170 MachineFunction *F = MBB->getParent();
25171 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25172 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25173 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25174 F->insert(MBBIter, XMMSaveMBB);
25175 F->insert(MBBIter, EndMBB);
25177 // Transfer the remainder of MBB and its successor edges to EndMBB.
25178 EndMBB->splice(EndMBB->begin(), MBB,
25179 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25180 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25182 // The original block will now fall through to the XMM save block.
25183 MBB->addSuccessor(XMMSaveMBB);
25184 // The XMMSaveMBB will fall through to the end block.
25185 XMMSaveMBB->addSuccessor(EndMBB);
25187 // Now add the instructions.
25188 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25189 DebugLoc DL = MI.getDebugLoc();
25191 unsigned CountReg = MI.getOperand(0).getReg();
25192 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25193 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25195 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25196 // If %al is 0, branch around the XMM save block.
25197 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25198 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25199 MBB->addSuccessor(EndMBB);
25200 }
25202 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25203 // that was just emitted, but clearly shouldn't be "saved".
25204 assert((MI.getNumOperands() <= 3 ||
25205 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25206 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25207 "Expected last argument to be EFLAGS");
25208 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25209 // In the XMM save block, save all the XMM argument registers.
25210 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25211 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
25212 MachineMemOperand *MMO = F->getMachineMemOperand(
25213 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25214 MachineMemOperand::MOStore,
25215 /*Size=*/16, /*Align=*/16);
25216 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25217 .addFrameIndex(RegSaveFrameIndex)
25218 .addImm(/*Scale=*/1)
25219 .addReg(/*IndexReg=*/0)
25220 .addImm(/*Disp=*/Offset)
25221 .addReg(/*Segment=*/0)
25222 .addReg(MI.getOperand(i).getReg())
25223 .addMemOperand(MMO);
25224 }
25226 MI.eraseFromParent(); // The pseudo instruction is gone now.
25228 return EndMBB;
25229 }
25231 // The EFLAGS operand of SelectItr might be missing a kill marker
25232 // because there were multiple uses of EFLAGS, and ISel didn't know
25233 // which to mark. Figure out whether SelectItr should have had a
25234 // kill marker, and set it if it should. Returns the correct kill
25235 // marker value.
25236 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25237 MachineBasicBlock* BB,
25238 const TargetRegisterInfo* TRI) {
25239 // Scan forward through BB for a use/def of EFLAGS.
25240 MachineBasicBlock::iterator miI(std::next(SelectItr));
25241 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25242 const MachineInstr& mi = *miI;
25243 if (mi.readsRegister(X86::EFLAGS))
25244 return false;
25245 if (mi.definesRegister(X86::EFLAGS))
25246 break; // Should have kill-flag - update below.
25249 // If we hit the end of the block, check whether EFLAGS is live into a
25250 // successor.
25251 if (miI == BB->end()) {
25252 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25253 sEnd = BB->succ_end();
25254 sItr != sEnd; ++sItr) {
25255 MachineBasicBlock* succ = *sItr;
25256 if (succ->isLiveIn(X86::EFLAGS))
25257 return false;
25258 }
25259 }
25261 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25262 // out. SelectMI should have a kill flag on EFLAGS.
25263 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25264 return true;
25265 }
25267 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25268 // together with other CMOV pseudo-opcodes into a single basic block with a
25269 // conditional jump around it.
25270 static bool isCMOVPseudo(MachineInstr &MI) {
25271 switch (MI.getOpcode()) {
25272 case X86::CMOV_FR32:
25273 case X86::CMOV_FR64:
25274 case X86::CMOV_GR8:
25275 case X86::CMOV_GR16:
25276 case X86::CMOV_GR32:
25277 case X86::CMOV_RFP32:
25278 case X86::CMOV_RFP64:
25279 case X86::CMOV_RFP80:
25280 case X86::CMOV_V2F64:
25281 case X86::CMOV_V2I64:
25282 case X86::CMOV_V4F32:
25283 case X86::CMOV_V4F64:
25284 case X86::CMOV_V4I64:
25285 case X86::CMOV_V16F32:
25286 case X86::CMOV_V8F32:
25287 case X86::CMOV_V8F64:
25288 case X86::CMOV_V8I64:
25289 case X86::CMOV_V8I1:
25290 case X86::CMOV_V16I1:
25291 case X86::CMOV_V32I1:
25292 case X86::CMOV_V64I1:
25293 return true;
25295 default:
25296 return false;
25297 }
25298 }
25300 MachineBasicBlock *
25301 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25302 MachineBasicBlock *BB) const {
25303 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25304 DebugLoc DL = MI.getDebugLoc();
25306 // To "insert" a SELECT_CC instruction, we actually have to insert the
25307 // diamond control-flow pattern. The incoming instruction knows the
25308 // destination vreg to set, the condition code register to branch on, the
25309 // true/false values to select between, and a branch opcode to use.
25310 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25311 MachineFunction::iterator It = ++BB->getIterator();
25313 //  thisMBB:
25314 //  ...
25315 //   TrueVal = ...
25316 //   cmpTY ccX, r1, r2
25317 //   bCC copy1MBB
25318 //   fallthrough --> copy0MBB
25319 MachineBasicBlock *thisMBB = BB;
25320 MachineFunction *F = BB->getParent();
25322 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25323 // as described above, by inserting a BB, and then making a PHI at the join
25324 // point to select the true and false operands of the CMOV in the PHI.
25326 // The code also handles two different cases of multiple CMOV opcodes
25327 // in a row.
25328 //
25329 // Case 1:
25330 // In this case, there are multiple CMOVs in a row, all of which are based on
25331 // the same condition setting (or the exact opposite condition setting).
25332 // In this case we can lower all the CMOVs using a single inserted BB, and
25333 // then make a number of PHIs at the join point to model the CMOVs. The only
25334 // trickiness here is that in a case like:
25336 // t2 = CMOV cond1 t1, f1
25337 // t3 = CMOV cond1 t2, f2
25339 // when rewriting this into PHIs, we have to perform some renaming on the
25340 // temps since you cannot have a PHI operand refer to a PHI result earlier
25341 // in the same block. The "simple" but wrong lowering would be:
25343 // t2 = PHI t1(BB1), f1(BB2)
25344 // t3 = PHI t2(BB1), f2(BB2)
25346 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25347 // renaming is to note that on the path through BB1, t2 is really just a
25348 // copy of t1, and do that renaming, properly generating:
25350 // t2 = PHI t1(BB1), f1(BB2)
25351 // t3 = PHI t1(BB1), f2(BB2)
25353 // In case 2, we lower cascaded CMOVs such as
25355 // (CMOV (CMOV F, T, cc1), T, cc2)
25357 // to two successive branches. For that, we look for another CMOV as the
25358 // following instruction.
25360 // Without this, we would add a PHI between the two jumps, which ends up
25361 // creating a few copies all around. For instance, for
25363 // (sitofp (zext (fcmp une)))
25365 // we would generate:
25366 //
25367 //   ucomiss %xmm1, %xmm0
25368 //   movss <1.0f>, %xmm0
25369 //   movaps %xmm0, %xmm1
25370 //   jne .LBB5_2
25371 //   xorps %xmm1, %xmm1
25372 // .LBB5_2:
25373 //   jp .LBB5_4
25374 //   movaps %xmm1, %xmm0
25375 // .LBB5_4:
25376 //   retq
25377 //
25378 // because this custom-inserter would have generated:
25379 //
25380 //   A
25381 //   | \
25382 //   |  B
25383 //   | /
25384 //   C
25385 //   | \
25386 //   |  D
25387 //   | /
25388 //   E
25389 //
25390 // A: X = ...; Y = ...
25391 // B: empty
25392 // C: Z = PHI [X, A], [Y, B]
25393 // D: empty
25394 // E: PHI [X, C], [Z, D]
25395 //
25396 // If we lower both CMOVs in a single step, we can instead generate:
25397 //
25398 //   A
25399 //   | \
25400 //   |  C
25401 //   | /|
25402 //   |/ |
25403 //   |  |
25404 //   |  D
25405 //   | /
25406 //   E
25407 //
25408 // A: X = ...; Y = ...
25409 // D: empty
25410 // E: PHI [X, A], [X, C], [Y, D]
25411 //
25412 // Which, in our sitofp/fcmp example, gives us something like:
25413 //
25414 //   ucomiss %xmm1, %xmm0
25415 //   movss <1.0f>, %xmm0
25416 //   jne .LBB5_4
25417 //   jp .LBB5_4
25418 //   xorps %xmm0, %xmm0
25419 // .LBB5_4:
25420 //   retq
25421 //
25422 MachineInstr *CascadedCMOV = nullptr;
25423 MachineInstr *LastCMOV = &MI;
25424 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25425 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25426 MachineBasicBlock::iterator NextMIIt =
25427 std::next(MachineBasicBlock::iterator(MI));
25429 // Check for case 1, where there are multiple CMOVs with the same condition
25430 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
25431 // number of jumps the most.
25433 if (isCMOVPseudo(MI)) {
25434 // See if we have a string of CMOVS with the same condition.
25435 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25436 (NextMIIt->getOperand(3).getImm() == CC ||
25437 NextMIIt->getOperand(3).getImm() == OppCC)) {
25438 LastCMOV = &*NextMIIt;
25439 ++NextMIIt;
25440 }
25441 }
25443 // This checks for case 2, but only do this if we didn't already find
25444 // case 1, as indicated by LastCMOV == MI.
25445 if (LastCMOV == &MI && NextMIIt != BB->end() &&
25446 NextMIIt->getOpcode() == MI.getOpcode() &&
25447 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25448 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25449 NextMIIt->getOperand(1).isKill()) {
25450 CascadedCMOV = &*NextMIIt;
25451 }
25453 MachineBasicBlock *jcc1MBB = nullptr;
25455 // If we have a cascaded CMOV, we lower it to two successive branches to
25456 // the same block. EFLAGS is used by both, so mark it as live in the second.
25457 if (CascadedCMOV) {
25458 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25459 F->insert(It, jcc1MBB);
25460 jcc1MBB->addLiveIn(X86::EFLAGS);
25461 }
25463 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25464 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25465 F->insert(It, copy0MBB);
25466 F->insert(It, sinkMBB);
25468 // If the EFLAGS register isn't dead in the terminator, then claim that it's
25469 // live into the sink and copy blocks.
25470 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25472 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25473 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25474 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25475 copy0MBB->addLiveIn(X86::EFLAGS);
25476 sinkMBB->addLiveIn(X86::EFLAGS);
25477 }
25479 // Transfer the remainder of BB and its successor edges to sinkMBB.
25480 sinkMBB->splice(sinkMBB->begin(), BB,
25481 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25482 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25484 // Add the true and fallthrough blocks as its successors.
25485 if (CascadedCMOV) {
25486 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25487 BB->addSuccessor(jcc1MBB);
25489 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
25490 // jump to the sinkMBB.
25491 jcc1MBB->addSuccessor(copy0MBB);
25492 jcc1MBB->addSuccessor(sinkMBB);
25493 } else {
25494 BB->addSuccessor(copy0MBB);
25495 }
25497 // The true block target of the first (or only) branch is always sinkMBB.
25498 BB->addSuccessor(sinkMBB);
25500 // Create the conditional branch instruction.
25501 unsigned Opc = X86::GetCondBranchFromCond(CC);
25502 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25504 if (CascadedCMOV) {
25505 unsigned Opc2 = X86::GetCondBranchFromCond(
25506 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25507 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25508 }
25510 // copy0MBB:
25511 //  %FalseValue = ...
25512 //  # fallthrough to sinkMBB
25513 copy0MBB->addSuccessor(sinkMBB);
25515 // sinkMBB:
25516 //  %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25518 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25519 MachineBasicBlock::iterator MIItEnd =
25520 std::next(MachineBasicBlock::iterator(LastCMOV));
25521 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25522 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25523 MachineInstrBuilder MIB;
25525 // As we are creating the PHIs, we have to be careful if there is more than
25526 // one. Later CMOVs may reference the results of earlier CMOVs, but later
25527 // PHIs have to reference the individual true/false inputs from earlier PHIs.
25528 // That also means that PHI construction must work forward from earlier to
25529 // later, and that the code must maintain a mapping from earlier PHI's
25530 // destination registers, and the registers that went into the PHI.
25532 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25533 unsigned DestReg = MIIt->getOperand(0).getReg();
25534 unsigned Op1Reg = MIIt->getOperand(1).getReg();
25535 unsigned Op2Reg = MIIt->getOperand(2).getReg();
25537 // If this CMOV we are generating is the opposite condition from
25538 // the jump we generated, then we have to swap the operands for the
25539 // PHI that is going to be generated.
25540 if (MIIt->getOperand(3).getImm() == OppCC)
25541 std::swap(Op1Reg, Op2Reg);
25543 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25544 Op1Reg = RegRewriteTable[Op1Reg].first;
25546 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25547 Op2Reg = RegRewriteTable[Op2Reg].second;
25549 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25550 TII->get(X86::PHI), DestReg)
25551 .addReg(Op1Reg).addMBB(copy0MBB)
25552 .addReg(Op2Reg).addMBB(thisMBB);
25554 // Add this PHI to the rewrite table.
25555 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25558 // If we have a cascaded CMOV, the second Jcc provides the same incoming
25559 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25560 if (CascadedCMOV) {
25561 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25562 // Copy the PHI result to the register defined by the second CMOV.
25563 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25564 DL, TII->get(TargetOpcode::COPY),
25565 CascadedCMOV->getOperand(0).getReg())
25566 .addReg(MI.getOperand(0).getReg());
25567 CascadedCMOV->eraseFromParent();
25570 // Now remove the CMOV(s).
25571 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25572 (MIIt++)->eraseFromParent();
25574 return sinkMBB;
25575 }
25577 MachineBasicBlock *
25578 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25579 MachineBasicBlock *BB) const {
25580 // Combine the following atomic floating-point modification pattern:
25581 // a.store(reg OP a.load(acquire), release)
25582 // Transform them into:
25583 // OPss (%gpr), %xmm
25584 // movss %xmm, (%gpr)
25585 // Or sd equivalent for 64-bit operations.
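// For example, "x.store(1.0f + x.load(acquire), release)" becomes roughly:
//   movss <1.0f>, %xmm0
//   addss (%rdi), %xmm0
//   movss %xmm0, (%rdi)
// This is sound because plain x86 loads already have acquire semantics and
// plain stores release semantics; the pattern is only formed when the
// load/op/store triple does not have to execute as one atomic RMW.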
25586 unsigned MOp, FOp;
25587 switch (MI.getOpcode()) {
25588 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25589 case X86::RELEASE_FADD32mr:
25590 FOp = X86::ADDSSrm;
25591 MOp = X86::MOVSSmr;
25592 break;
25593 case X86::RELEASE_FADD64mr:
25594 FOp = X86::ADDSDrm;
25595 MOp = X86::MOVSDmr;
25596 break;
25597 }
25598 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25599 DebugLoc DL = MI.getDebugLoc();
25600 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25601 unsigned ValOpIdx = X86::AddrNumOperands;
25602 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25603 MachineInstrBuilder MIB =
25604 BuildMI(*BB, MI, DL, TII->get(FOp),
25605 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25606 .addReg(VSrc);
25607 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25608 MachineOperand &Operand = MI.getOperand(i);
25609 // Clear any kill flags on register operands as we'll create a second
25610 // instruction using the same address operands.
25611 if (Operand.isReg())
25612 Operand.setIsKill(false);
25613 MIB.add(Operand);
25614 }
25615 MachineInstr *FOpMI = MIB;
25616 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25617 for (int i = 0; i < X86::AddrNumOperands; ++i)
25618 MIB.add(MI.getOperand(i));
25619 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25620 MI.eraseFromParent(); // The pseudo instruction is gone now.
25621 return BB;
25622 }
25624 MachineBasicBlock *
25625 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25626 MachineBasicBlock *BB) const {
25627 MachineFunction *MF = BB->getParent();
25628 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25629 DebugLoc DL = MI.getDebugLoc();
25630 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25632 assert(MF->shouldSplitStack());
25634 const bool Is64Bit = Subtarget.is64Bit();
25635 const bool IsLP64 = Subtarget.isTarget64BitLP64();
25637 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25638 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
25640 // BB:
25641 //  ... [Till the alloca]
25642 // If stacklet is not large enough, jump to mallocMBB
25643 //
25644 // bumpMBB:
25645 //  Allocate by subtracting from RSP
25646 //  Jump to continueMBB
25647 //
25648 // mallocMBB:
25649 //  Allocate by call to runtime
25650 //
25651 // continueMBB:
25652 //  ...
25653 //  [rest of original BB]
25656 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25657 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25658 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25660 MachineRegisterInfo &MRI = MF->getRegInfo();
25661 const TargetRegisterClass *AddrRegClass =
25662 getRegClassFor(getPointerTy(MF->getDataLayout()));
25664 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25665 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25666 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25667 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25668 sizeVReg = MI.getOperand(1).getReg(),
25669 physSPReg =
25670 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25672 MachineFunction::iterator MBBIter = ++BB->getIterator();
25674 MF->insert(MBBIter, bumpMBB);
25675 MF->insert(MBBIter, mallocMBB);
25676 MF->insert(MBBIter, continueMBB);
25678 continueMBB->splice(continueMBB->begin(), BB,
25679 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25680 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25682 // Add code to the main basic block to check if the stack limit has been hit,
25683 // and if so, jump to mallocMBB otherwise to bumpMBB.
25684 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
25685 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
25686 .addReg(tmpSPVReg).addReg(sizeVReg);
25687 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
25688 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
25689 .addReg(SPLimitVReg);
25690 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
25692 // bumpMBB simply decreases the stack pointer, since we know the current
25693 // stacklet has enough space.
25694 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
25695 .addReg(SPLimitVReg);
25696 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
25697 .addReg(SPLimitVReg);
25698 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25700 // Calls into a routine in libgcc to allocate more space from the heap.
25701 const uint32_t *RegMask =
25702 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
25703 if (IsLP64) {
25704 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
25705 .addReg(sizeVReg);
25706 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25707 .addExternalSymbol("__morestack_allocate_stack_space")
25708 .addRegMask(RegMask)
25709 .addReg(X86::RDI, RegState::Implicit)
25710 .addReg(X86::RAX, RegState::ImplicitDefine);
25711 } else if (Is64Bit) {
25712 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
25713 .addReg(sizeVReg);
25714 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25715 .addExternalSymbol("__morestack_allocate_stack_space")
25716 .addRegMask(RegMask)
25717 .addReg(X86::EDI, RegState::Implicit)
25718 .addReg(X86::EAX, RegState::ImplicitDefine);
25719 } else {
25720 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
25721 .addImm(16);
25722 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
25723 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
25724 .addExternalSymbol("__morestack_allocate_stack_space")
25725 .addRegMask(RegMask)
25726 .addReg(X86::EAX, RegState::ImplicitDefine);
25730 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
25731 .addImm(16);
25732 }
25733 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
25734 .addReg(IsLP64 ? X86::RAX : X86::EAX);
25735 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25737 // Set up the CFG correctly.
25738 BB->addSuccessor(bumpMBB);
25739 BB->addSuccessor(mallocMBB);
25740 mallocMBB->addSuccessor(continueMBB);
25741 bumpMBB->addSuccessor(continueMBB);
25743 // Take care of the PHI nodes.
25744 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
25745 MI.getOperand(0).getReg())
25746 .addReg(mallocPtrVReg)
25747 .addMBB(mallocMBB)
25748 .addReg(bumpSPPtrVReg)
25749 .addMBB(bumpMBB);
25751 // Delete the original pseudo instruction.
25752 MI.eraseFromParent();
25755 return continueMBB;
25756 }
25758 MachineBasicBlock *
25759 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
25760 MachineBasicBlock *BB) const {
25761 MachineFunction *MF = BB->getParent();
25762 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25763 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25764 DebugLoc DL = MI.getDebugLoc();
25766 assert(!isAsynchronousEHPersonality(
25767 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25768 "SEH does not use catchret!");
25770 // Only 32-bit EH needs to worry about manually restoring stack pointers.
25771 if (!Subtarget.is32Bit())
25772 return BB;
25774 // C++ EH creates a new target block to hold the restore code, and wires up
25775 // the new block to the return destination with a normal JMP_4.
25776 MachineBasicBlock *RestoreMBB =
25777 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25778 assert(BB->succ_size() == 1);
25779 MF->insert(std::next(BB->getIterator()), RestoreMBB);
25780 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25781 BB->addSuccessor(RestoreMBB);
25782 MI.getOperand(0).setMBB(RestoreMBB);
25784 auto RestoreMBBI = RestoreMBB->begin();
25785 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
25786 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
25787 return BB;
25788 }
25790 MachineBasicBlock *
25791 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25792 MachineBasicBlock *BB) const {
25793 MachineFunction *MF = BB->getParent();
25794 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25795 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25796 // Only 32-bit SEH requires special handling for catchpad.
25797 if (IsSEH && Subtarget.is32Bit()) {
25798 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25799 DebugLoc DL = MI.getDebugLoc();
25800 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
25801 }
25802 MI.eraseFromParent();
25803 return BB;
25804 }
25806 MachineBasicBlock *
25807 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25808 MachineBasicBlock *BB) const {
25809 // So, here we replace TLSADDR with the sequence:
25810 // adjust_stackdown -> TLSADDR -> adjust_stackup.
25811 // We need this because TLSADDR is lowered into a call
25812 // inside MC; without the two markers, shrink-wrapping
25813 // may push the prologue/epilogue past them.
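// After this, the block looks something like:
//   ADJCALLSTACKDOWN64 0, 0, 0
//   TLS_addr64 ...
//   ADJCALLSTACKUP64 0, 0
// so the frame-layout passes treat the hidden __tls_get_addr call like any
// other call site.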
25814 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25815 DebugLoc DL = MI.getDebugLoc();
25816 MachineFunction &MF = *BB->getParent();
25818 // Emit CALLSEQ_START right before the instruction.
25819 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25820 MachineInstrBuilder CallseqStart =
25821 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
25822 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25824 // Emit CALLSEQ_END right after the instruction.
25825 // We don't call erase from parent because we want to keep the
25826 // original instruction around.
25827 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25828 MachineInstrBuilder CallseqEnd =
25829 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
25830 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
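// The resulting sequence around the (retained) pseudo is then, sketched with
// x86's call-frame pseudos:
//
//   ADJCALLSTACKDOWN 0, 0, 0    ; CALLSEQ_START marker
//   TLSADDR ...                 ; becomes a real call inside MC
//   ADJCALLSTACKUP 0, 0         ; CALLSEQ_END marker
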
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  // This is pretty easy. We're taking the value that we received from
  // our load from the relocation, sticking it in either RDI (x86-64)
  // or EAX and doing an indirect call. The return value will then
  // be in the normal return register.
  MachineFunction *F = BB->getParent();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
  assert(MI.getOperand(3).isGlobal() && "This should be a global");

  // Get a register mask for the lowered call.
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
  // proper register mask.
  const uint32_t *RegMask =
      Subtarget.is64Bit() ?
      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
  if (Subtarget.is64Bit()) {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
            .addReg(X86::RIP)
            .addImm(0)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
    addDirectMem(MIB, X86::RDI);
    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else if (!isPositionIndependent()) {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(0)
            .addImm(0)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(TII->getGlobalBaseReg(F))
            .addImm(0)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
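// On x86-64 this materializes the usual Darwin TLV access pattern (a sketch;
// 'var' stands for whichever global the pseudo references):
//
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)               ; thread-local address returned in %rax
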
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  unsigned DstReg;
  unsigned MemOpndSlot = 0;

  unsigned CurOp = 0;

  DstReg = MI.getOperand(CurOp++).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");

  unsigned mainDstReg = MRI.createVirtualRegister(RC);
  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

  MemOpndSlot = CurOp;

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  // restoreMBB:
  //  if base pointer being used, load it from frame
  //  v_restore = 1

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);
  MF->push_back(restoreMBB);
  restoreMBB->setHasAddressTaken();

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  unsigned PtrStoreOpc = 0;
  unsigned LabelReg = 0;
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  // Prepare IP either in reg or imm.
  if (!UseImmLabel) {
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
    LabelReg = MRI.createVirtualRegister(PtrRC);
    if (Subtarget.is64Bit()) {
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
                .addReg(X86::RIP)
                .addImm(0)
                .addReg(0)
                .addMBB(restoreMBB)
                .addReg(0);
    } else {
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
                .addReg(XII->getGlobalBaseReg(MF))
                .addImm(0)
                .addReg(0)
                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
                .addReg(0);
    }
  } else
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  // Store IP
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
    else
      MIB.add(MI.getOperand(MemOpndSlot + i));
  }
  if (!UseImmLabel)
    MIB.addReg(LabelReg);
  else
    MIB.addMBB(restoreMBB);
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Setup
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
            .addMBB(restoreMBB);

  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  MIB.addRegMask(RegInfo->getNoPreservedMask());
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(restoreMBB);

  // mainMBB:
  //  v_main = 0
  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), DstReg)
      .addReg(mainDstReg).addMBB(mainMBB)
      .addReg(restoreDstReg).addMBB(restoreMBB);

  // restoreMBB:
  if (RegInfo->hasBasePointer(*MF)) {
    const bool Uses64BitFramePtr =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
    X86FI->setRestoreBasePointer(MF);
    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
    unsigned BasePtr = RegInfo->getBaseRegister();
    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
  }
  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
  restoreMBB->addSuccessor(sinkMBB);

  MI.eraseFromParent();
  return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
  unsigned Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
  unsigned SP = RegInfo->getStackRegister();

  MachineInstrBuilder MIB;

  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset = 2 * PVT.getStoreSize();

  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

  // Reload FP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload IP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), LabelOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload SP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), SPOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Jump
  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

  MI.eraseFromParent();
  return MBB;
}
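// Buffer layout assumed by this setjmp/longjmp pair (pointer-sized slots):
//   buf[0] = frame pointer   (reloaded into FP above)
//   buf[1] = resume address  (LabelOffset; reloaded into Tmp and jumped to)
//   buf[2] = stack pointer   (SPOffset; reloaded into SP)
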
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

  unsigned Op = 0;
  unsigned VR = 0;

  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  if (UseImmLabel) {
    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  } else {
    const TargetRegisterClass *TRC =
        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
    VR = MRI->createVirtualRegister(TRC);
    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

    if (Subtarget.is64Bit())
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
          .addReg(X86::RIP)
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB)
          .addReg(0);
    else
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
          .addReg(0) /* TII->getGlobalBaseReg(MF) */
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
          .addReg(0);
  }

  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
  addFrameReference(MIB, FI, 36);
  if (UseImmLabel)
    MIB.addMBB(DispatchBB);
  else
    MIB.addReg(VR);
}
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = BB->getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  int FI = MFI.getFunctionContextIndex();

  // Get a mapping of the call site numbers to all of the landing pads they're
  // associated with.
  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
  unsigned MaxCSNum = 0;
  for (auto &MBB : *MF) {
    if (!MBB.isEHPad())
      continue;

    MCSymbol *Sym = nullptr;
    for (const auto &MI : MBB) {
      if (MI.isDebugValue())
        continue;

      assert(MI.isEHLabel() && "expected EH_LABEL");
      Sym = MI.getOperand(0).getMCSymbol();
      break;
    }

    if (!MF->hasCallSiteLandingPad(Sym))
      continue;

    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
      CallSiteNumToLPad[CSI].push_back(&MBB);
      MaxCSNum = std::max(MaxCSNum, CSI);
    }
  }

  // Get an ordered list of the machine basic blocks for the jump table.
  std::vector<MachineBasicBlock *> LPadList;
  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());

  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
    for (auto &LP : CallSiteNumToLPad[CSI]) {
      LPadList.push_back(LP);
      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad(true);

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert MBBs.
  MF->push_back(DispatchBB);
  MF->push_back(DispContBB);
  MF->push_back(TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

  // Create the jump table and associated information
  MachineJumpTableInfo *JTI =
      MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  const X86RegisterInfo &RI = TII->getRegisterInfo();
  // Add a register mask with no preserved registers. This results in all
  // registers being marked as clobbered.
  if (RI.hasBasePointer(*MF)) {
    const bool FPIs64Bit =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
    MFI->setRestoreBasePointer(MF);

    unsigned FP = RI.getFrameRegister(*MF);
    unsigned BP = RI.getBaseRegister();
    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
                 MFI->getRestoreBasePointerOffset())
        .addRegMask(RI.getNoPreservedMask());
  } else {
    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
        .addRegMask(RI.getNoPreservedMask());
  }

  unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
                    4);
  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
      .addReg(IReg)
      .addImm(LPadList.size());
  BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);

  unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
      .addReg(IReg)
      .addImm(1);
  BuildMI(DispContBB, DL,
          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
      .addReg(0)
      .addImm(Subtarget.is64Bit() ? 8 : 4)
      .addReg(JReg)
      .addJumpTableIndex(MJTI)
      .addReg(0);

  // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
  for (auto &LP : LPadList)
    if (SeenMBBs.insert(LP).second)
      DispContBB->addSuccessor(LP);

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  SmallVector<MachineBasicBlock *, 64> MBBLPads;
  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
  for (MachineBasicBlock *MBB : InvokeBBs) {
    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    // Keep a copy of Successors since it's modified inside the loop.
    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
                                                   MBB->succ_rend());
    // FIXME: Avoid quadratic complexity.
    for (auto MBBS : Successors) {
      if (MBBS->isEHPad()) {
        MBB->removeSuccessor(MBBS);
        MBBLPads.push_back(MBBS);
      }
    }

    MBB->addSuccessor(DispatchBB);

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (auto &II : reverse(*MBB)) {
      if (!II.isCall())
        continue;

      DenseMap<unsigned, bool> DefRegs;
      for (auto &MOp : II.operands())
        if (MOp.isReg())
          DefRegs[MOp.getReg()] = true;

      MachineInstrBuilder MIB(*MF, &II);
      for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
        unsigned Reg = SavedRegs[RI];
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (auto &LP : MBBLPads)
    LP->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();

  return BB;
}
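// A rough sketch of what the dispatch blocks boil down to (64-bit flavour;
// virtual registers shown as %eax/%rax for readability, and the exact
// function-context slot offset is an implementation detail):
//
//   DispatchBB:
//     movl <call-site slot>, %eax   ; reload the active call-site index
//     cmpl $NumLPads, %eax
//     ja   TrapBB                   ; out of range -> trap (ud2)
//   DispContBB:
//     subl $1, %eax                 ; call-site numbers are 1-based
//     jmpq *JTI(,%rax,8)            ; indexed jump through the landing pads
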
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  default: llvm_unreachable("Unexpected instr type to insert");
  case X86::TAILJMPd64:
  case X86::TAILJMPr64:
  case X86::TAILJMPm64:
  case X86::TAILJMPr64_REX:
  case X86::TAILJMPm64_REX:
    llvm_unreachable("TAILJMP64 would not be touched here.");
  case X86::TCRETURNdi64:
  case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    return BB;
  case X86::TLS_addr32:
  case X86::TLS_addr64:
  case X86::TLS_base_addr32:
  case X86::TLS_base_addr64:
    return EmitLoweredTLSAddr(MI, BB);
  case X86::CATCHRET:
    return EmitLoweredCatchRet(MI, BB);
  case X86::CATCHPAD:
    return EmitLoweredCatchPad(MI, BB);
  case X86::SEG_ALLOCA_32:
  case X86::SEG_ALLOCA_64:
    return EmitLoweredSegAlloca(MI, BB);
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_FR128:
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
  case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return EmitLoweredSelect(MI, BB);

  case X86::RDFLAGS32:
  case X86::RDFLAGS64: {
    unsigned PushF =
        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
    // Permit reads of the FLAGS register without it being defined.
    // This intrinsic exists to read external processor state in flags, such as
    // the trap flag, interrupt flag, and direction flag, none of which are
    // modeled by the backend.
    Push->getOperand(2).setIsUndef();
    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
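  // In other words, RDFLAGS lowers to the classic idiom (sketch):
  //   pushfq          ; FLAGS -> stack
  //   popq   %dst     ; stack -> destination register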
  case X86::WRFLAGS32:
  case X86::WRFLAGS64: {
    unsigned Push =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
    unsigned PopF =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
    BuildMI(*BB, MI, DL, TII->get(PopF));

    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }

  case X86::RELEASE_FADD32mr:
  case X86::RELEASE_FADD64mr:
    return EmitLoweredAtomicFP(MI, BB);

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
        .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
        .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI.getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM = getAddressFromInstr(&MI, 0);
    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }
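  // Net effect of the control-word dance above, roughly:
  //   fnstcw <slot>          ; save the current control word
  //   movw   <slot>, %oldcw  ; remember the original value
  //   movw   $0xC7F, <slot>  ; select round-towards-zero
  //   fldcw  <slot>          ; activate it
  //   movw   %oldcw, <slot>  ; put the original image back in memory
  //   fistp  <dst>           ; truncating integer store
  //   fldcw  <slot>          ; restore the original rounding mode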
  // String/text processing lowering.
  case X86::PCMPISTRM128REG:
  case X86::VPCMPISTRM128REG:
  case X86::PCMPISTRM128MEM:
  case X86::VPCMPISTRM128MEM:
  case X86::PCMPESTRM128REG:
  case X86::VPCMPESTRM128REG:
  case X86::PCMPESTRM128MEM:
  case X86::VPCMPESTRM128MEM:
    assert(Subtarget.hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());

  // String/text processing lowering.
  case X86::PCMPISTRIREG:
  case X86::VPCMPISTRIREG:
  case X86::PCMPISTRIMEM:
  case X86::VPCMPISTRIMEM:
  case X86::PCMPESTRIREG:
  case X86::VPCMPESTRIREG:
  case X86::PCMPESTRIMEM:
  case X86::VPCMPESTRIMEM:
    assert(Subtarget.hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());

  // Thread synchronization.
  case X86::MONITOR:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
  case X86::MONITORX:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);

  // Cache line zero
  case X86::CLZERO:
    return emitClzero(&MI, BB, Subtarget);

  // PKU feature
  case X86::WRPKRU:
    return emitWRPKRU(MI, BB, Subtarget);
  case X86::RDPKRU:
    return emitRDPKRU(MI, BB, Subtarget);
  // xbegin
  case X86::XBEGIN:
    return emitXBegin(MI, BB, Subtarget.getInstrInfo());

  case X86::VASTART_SAVE_XMM_REGS:
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

  case X86::VAARG_64:
    return EmitVAARG64WithCustomInserter(MI, BB);

  case X86::EH_SjLj_SetJmp32:
  case X86::EH_SjLj_SetJmp64:
    return emitEHSjLjSetJmp(MI, BB);

  case X86::EH_SjLj_LongJmp32:
  case X86::EH_SjLj_LongJmp64:
    return emitEHSjLjLongJmp(MI, BB);

  case X86::Int_eh_sjlj_setup_dispatch:
    return EmitSjLjDispatchBlock(MI, BB);

  case TargetOpcode::STATEPOINT:
    // As an implementation detail, STATEPOINT shares the STACKMAP format at
    // this point in the process. We diverge later.
    return emitPatchPoint(MI, BB);

  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);

  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // Do nothing here, handle in xray instrumentation pass.
    return BB;

  case X86::LCMPXCHG8B: {
    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    // In addition to the four E[ABCD] registers implied by its encoding,
    // CMPXCHG8B requires a memory operand. If the target is i686 and the
    // current function needs a base pointer - which is ESI for i686 - the
    // register allocator would not be able to allocate registers for an
    // address of the form X(%reg, %reg, Y): there would never be enough
    // unreserved registers during regalloc (without the base pointer the
    // only option would be X(%edi, %esi, Y)). We give the register allocator
    // a hand by precomputing the address in a new vreg using LEA.

    // If it is not i686 or there is no base pointer - nothing to do here.
    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
      return BB;

    // Even though this code does not necessarily need the base pointer to
    // be ESI, we check for that. The reason: if this assert fails, something
    // has changed in the compiler's base pointer handling, which most
    // probably has to be addressed here as well.
    assert(TRI->getBaseRegister() == X86::ESI &&
           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
           "base pointer in mind");

    MachineRegisterInfo &MRI = MF->getRegInfo();
    MVT SPTy = getPointerTy(MF->getDataLayout());
    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

    X86AddressMode AM = getAddressFromInstr(&MI, 0);
    // Regalloc does not need any help when the memory operand of CMPXCHG8B
    // does not use the index register.
    if (AM.IndexReg == X86::NoRegister)
      return BB;

    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
    // four operand definitions that are E[ABCD] registers. We skip them and
    // then insert the LEA.
    MachineBasicBlock::iterator MBBI(MI);
    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
      --MBBI;
    addFullAddress(
        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

    return BB;
  }
  case X86::LCMPXCHG16B:
    return BB;
  case X86::LCMPXCHG8B_SAVE_EBX:
  case X86::LCMPXCHG16B_SAVE_RBX: {
    unsigned BasePtr =
        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
    if (!BB->isLiveIn(BasePtr))
      BB->addLiveIn(BasePtr);
    return BB;
  }
  }
}
//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  Known.resetAll();
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    LLVM_FALLTHROUGH;
  case X86ISD::SETCC:
    Known.Zero.setBitsFrom(1);
    break;
  case X86ISD::MOVMSK: {
    unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
    Known.Zero.setBitsFrom(NumLoBits);
    break;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
        Known.setAllZero();
        break;
      }

      DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
      unsigned ShAmt = ShiftImm->getZExtValue();
      if (Opc == X86ISD::VSHLI) {
        Known.Zero <<= ShAmt;
        Known.One <<= ShAmt;
        // Low bits are known zero.
        Known.Zero.setLowBits(ShAmt);
      } else {
        Known.Zero.lshrInPlace(ShAmt);
        Known.One.lshrInPlace(ShAmt);
        // High bits are known zero.
        Known.Zero.setHighBits(ShAmt);
      }
    }
    break;
  }
  case X86ISD::VZEXT: {
    SDValue N0 = Op.getOperand(0);
    unsigned NumElts = VT.getVectorNumElements();

    EVT SrcVT = N0.getValueType();
    unsigned InNumElts = SrcVT.getVectorNumElements();
    unsigned InBitWidth = SrcVT.getScalarSizeInBits();
    assert(InNumElts >= NumElts && "Illegal VZEXT input");

    Known = KnownBits(InBitWidth);
    APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
    DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
    Known = Known.zext(BitWidth);
    Known.Zero.setBitsFrom(InBitWidth);
    break;
  }
  }
}
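// Worked example for the shift cases above: for X86ISD::VSHLI x, 4 on i32
// elements, x's known bits are shifted left by four and Known.Zero
// additionally gets its low four bits set, since a left shift always shifts
// in zeros.
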
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  unsigned VTBits = Op.getScalarValueSizeInBits();
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::SETCC_CARRY:
    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
    return VTBits;

  case X86ISD::VSEXT: {
    SDValue Src = Op.getOperand(0);
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    Tmp += VTBits - Src.getScalarValueSizeInBits();
    return Tmp;
  }

  case X86ISD::VSRAI: {
    SDValue Src = Op.getOperand(0);
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
    ShiftVal += Tmp;
    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
  }

  case X86ISD::PCMPGT:
  case X86ISD::PCMPEQ:
  case X86ISD::CMPP:
  case X86ISD::VPCOM:
  case X86ISD::VPCOMU:
    // Vector compares return zero/all-bits result values.
    return VTBits;
  }

  // Fallback case.
  return 1;
}

/// Returns true (and the GlobalValue and the offset) if the node is a
/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
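// Example: N = X86ISD::Wrapper(GlobalAddress(@g, +8)) yields GA = @g and
// Offset = 8; any other node shape defers to the generic TargetLowering
// implementation.
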
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                    bool AllowFloatDomain, bool AllowIntDomain,
                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
  unsigned NumMaskElts = Mask.size();
  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
    unsigned MaxScale = 64 / MaskEltSize;
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
      bool Match = true;
      unsigned NumDstElts = NumMaskElts / Scale;
      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
      }
      if (Match) {
        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
        SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
        if (SrcVT != MaskVT)
          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
        Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
                                  : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
        return true;
      }
    }
  }

  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
      isUndefOrEqual(Mask[0], 0) &&
      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
    Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }

  // Check if we have SSE3 which will let us use MOVDDUP etc. The
  // instructions are no slower than UNPCKLPD but have the option to
  // fold the input operand into even an unaligned memory load.
  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }

  if (MaskVT.is256BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v4f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
  }

  if (MaskVT.is512BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX512() &&
           "AVX512 required for 512-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v8f64;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
  }

  // Attempt to match against broadcast-from-vector.
  if (Subtarget.hasAVX2()) {
    SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
    if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
      SrcVT = DstVT = MaskVT;
      Shuffle = X86ISD::VBROADCAST;
      return true;
    }
  }

  return false;
}
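// Example: a v2f64 mask of {0, 0} (splat the low element) is matched by the
// SSE3 block above to X86ISD::MOVDDUP with SrcVT = DstVT = v2f64.
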
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                           bool AllowFloatDomain,
                                           bool AllowIntDomain,
                                           const X86Subtarget &Subtarget,
                                           unsigned &Shuffle, MVT &ShuffleVT,
                                           unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();

  bool ContainsZeros = false;
  APInt Zeroable(NumMaskElts, false);
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (isUndefOrZero(M))
      Zeroable.setBit(i);
    ContainsZeros |= (M == SM_SentinelZero);
  }

  // Attempt to match against byte/bit shifts.
  // FIXME: Add 512-bit support.
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
                                             MaskVT.getScalarSizeInBits(), Mask,
                                             0, Zeroable, Subtarget);
    if (0 < ShiftAmt) {
      PermuteImm = (unsigned)ShiftAmt;
      return true;
    }
  }

  // Ensure we don't contain any zero elements.
  if (ContainsZeros)
    return false;

  assert(llvm::all_of(Mask, [&](int M) {
           return SM_SentinelUndef <= M && M < (int)NumMaskElts;
         }) && "Expected unary shuffle");

  unsigned InputSizeInBits = MaskVT.getSizeInBits();
  unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

  // Handle PSHUFLW/PSHUFHW repeated patterns.
  if (MaskScalarSizeInBits == 16) {
    SmallVector<int, 4> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      ArrayRef<int> LoMask(Mask.data() + 0, 4);
      ArrayRef<int> HiMask(Mask.data() + 4, 4);

      // PSHUFLW: permute lower 4 elements only.
      if (isUndefOrInRange(LoMask, 0, 4) &&
          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
        Shuffle = X86ISD::PSHUFLW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(LoMask);
        return true;
      }

      // PSHUFHW: permute upper 4 elements only.
      if (isUndefOrInRange(HiMask, 4, 8) &&
          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
        // Offset the HiMask so that we can create the shuffle immediate.
        int OffsetHiMask[4];
        for (int i = 0; i != 4; ++i)
          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

        Shuffle = X86ISD::PSHUFHW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
        return true;
      }
    }

    return false;
  }

  // We only support permutation of 32/64 bit elements after this.
  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
    return false;

  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
  if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
    return false;

  // Pre-AVX2 we must use float shuffles on 256-bit vectors.
  if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
    AllowFloatDomain = true;
    AllowIntDomain = false;
  }

  // Check for lane crossing permutes.
  if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
    // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
    if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
      Shuffle = X86ISD::VPERMI;
      ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
      PermuteImm = getV4X86ShuffleImm(Mask);
      return true;
    }
    if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
      SmallVector<int, 4> RepeatedMask;
      if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
        Shuffle = X86ISD::VPERMI;
        ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
        PermuteImm = getV4X86ShuffleImm(RepeatedMask);
        return true;
      }
    }
    return false;
  }

  // VPERMILPD can permute with a non-repeating shuffle.
  if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
    Shuffle = X86ISD::VPERMILPI;
    ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
    PermuteImm = 0;
    for (int i = 0, e = Mask.size(); i != e; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef)
        continue;
      assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
      PermuteImm |= (M & 1) << i;
    }
    return true;
  }

  // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
  SmallVector<int, 4> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
    return false;

  // Narrow the repeated mask for 32-bit element permutes.
  SmallVector<int, 4> WordMask = RepeatedMask;
  if (MaskScalarSizeInBits == 64)
    scaleShuffleMask(2, RepeatedMask, WordMask);

  Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
  ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
  ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
  PermuteImm = getV4X86ShuffleImm(WordMask);
  return true;
}
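// Example: the v8i16 mask {2, 1, 0, 3, 4, 5, 6, 7} leaves the upper half in
// place, so it matches PSHUFLW with PermuteImm = getV4X86ShuffleImm({2,1,0,3})
// = 0b11000110 = 0xC6.
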
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                     bool AllowFloatDomain, bool AllowIntDomain,
                                     SDValue &V1, SDValue &V2, SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &ShuffleVT,
                                     bool IsUnary) {
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  if (MaskVT.is128BitVector()) {
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVLHPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVHLPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      ShuffleVT = MaskVT;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
      ShuffleVT = MaskVT;
      return true;
    }
  }

  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
    if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
                                    DAG, Subtarget)) {
      ShuffleVT = MaskVT;
      if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
        ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
      return true;
    }
  }

  return false;
}
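// Example: a v2f64 mask of {0, 3} takes the low element from V1 and the high
// element from V2; after the std::swap above this is exactly MOVSD, which
// takes its low element from the second operand and keeps the high element of
// the first.
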
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                            bool AllowFloatDomain,
                                            bool AllowIntDomain,
                                            SDValue &V1, SDValue &V2, SDLoc &DL,
                                            SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget,
                                            unsigned &Shuffle, MVT &ShuffleVT,
                                            unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  // Attempt to match against PALIGNR byte rotate.
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
    int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
    if (0 < ByteRotation) {
      Shuffle = X86ISD::PALIGNR;
      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }

  // Attempt to combine to X86ISD::BLENDI.
  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
    uint64_t BlendMask = 0;
    bool ForceV1Zero = false, ForceV2Zero = false;
    SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
    if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
                                  BlendMask)) {
      if (MaskVT == MVT::v16i16) {
        // We can only use v16i16 PBLENDW if the lanes are repeated.
        SmallVector<int, 8> RepeatedMask;
        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
                                        RepeatedMask)) {
          assert(RepeatedMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          PermuteImm = 0;
          for (int i = 0; i < 8; ++i)
            if (RepeatedMask[i] >= 8)
              PermuteImm |= 1 << i;
          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
          Shuffle = X86ISD::BLENDI;
          ShuffleVT = MaskVT;
          return true;
        }
      } else {
        // Determine a type compatible with X86ISD::BLENDI.
        ShuffleVT = MaskVT;
        if (Subtarget.hasAVX2()) {
          if (ShuffleVT == MVT::v4i64)
            ShuffleVT = MVT::v8i32;
          else if (ShuffleVT == MVT::v2i64)
            ShuffleVT = MVT::v4i32;
        } else {
          if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
            ShuffleVT = MVT::v8i16;
          else if (ShuffleVT == MVT::v4i64)
            ShuffleVT = MVT::v4f64;
          else if (ShuffleVT == MVT::v8i32)
            ShuffleVT = MVT::v8f32;
        }

        if (!ShuffleVT.isFloatingPoint()) {
          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
          BlendMask =
              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
        }

        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
        PermuteImm = (unsigned)BlendMask;
        Shuffle = X86ISD::BLENDI;
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS.
  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
      MaskVT.is128BitVector()) {
    APInt Zeroable(4, 0);
    for (unsigned i = 0; i != NumMaskElts; ++i)
      if (isUndefOrZero(Mask[i]))
        Zeroable.setBit(i);

    if (Zeroable.getBoolValue() &&
        matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
      Shuffle = X86ISD::INSERTPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
  }

  // Attempt to combine to SHUFPD.
  if (AllowFloatDomain && EltSizeInBits == 64 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
      Shuffle = X86ISD::SHUFP;
      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
      return true;
    }
  }

  // Attempt to combine to SHUFPS.
  if (AllowFloatDomain && EltSizeInBits == 32 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    SmallVector<int, 4> RepeatedMask;
    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask, to determine if it's just
      // referencing one of the vectors, is zeroable or entirely undef.
      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
        int M0 = RepeatedMask[Offset];
        int M1 = RepeatedMask[Offset + 1];

        if (isUndefInRange(RepeatedMask, Offset, 2)) {
          return DAG.getUNDEF(MaskVT);
        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
          return getZeroVector(MaskVT, Subtarget, DAG, DL);
        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V1;
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V2;
        }

        return SDValue();
      };

      int ShufMask[4] = {-1, -1, -1, -1};
      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (Lo && Hi) {
        V1 = Lo;
        V2 = Hi;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  return false;
}
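// Example: the v4f32 mask {0, 1, 4, 5} repeats per 128-bit lane; MatchHalf
// resolves the low half to V1 and the high half to V2, so this becomes SHUFPS
// with ShufMask = {0, 1, 0, 1}, i.e. PermuteImm = 0x44.
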
27255 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
27258 /// This is the leaf of the recursive combine below. When we have found some
27259 /// chain of single-use x86 shuffle instructions and accumulated the combined
27260 /// shuffle mask represented by them, this will try to pattern match that mask
27261 /// into either a single instruction if there is a special purpose instruction
27262 /// for this operation, or into a PSHUFB instruction which is a fully general
27263 /// instruction but should only be used to replace chains over a certain depth.
27264 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27265 ArrayRef<int> BaseMask, int Depth,
27266 bool HasVariableMask, SelectionDAG &DAG,
27267 TargetLowering::DAGCombinerInfo &DCI,
27268 const X86Subtarget &Subtarget) {
27269 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27270 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27271 "Unexpected number of shuffle inputs!");
27273 // Find the inputs that enter the chain. Note that multiple uses are OK
27274 // here, we're not going to remove the operands we find.
27275 bool UnaryShuffle = (Inputs.size() == 1);
27276 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27277 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27278 : peekThroughBitcasts(Inputs[1]));
27280 MVT VT1 = V1.getSimpleValueType();
27281 MVT VT2 = V2.getSimpleValueType();
27282 MVT RootVT = Root.getSimpleValueType();
27283 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27284 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27285 "Vector size mismatch");
27290 unsigned NumBaseMaskElts = BaseMask.size();
27291 if (NumBaseMaskElts == 1) {
27292 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27293 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27298 unsigned RootSizeInBits = RootVT.getSizeInBits();
27299 unsigned NumRootElts = RootVT.getVectorNumElements();
27300 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27301 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27302 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27304 // Don't combine if we are a AVX512/EVEX target and the mask element size
27305 // is different from the root element size - this would prevent writemasks
27306 // from being reused.
27307 // TODO - this currently prevents all lane shuffles from occurring.
27308 // TODO - check for writemasks usage instead of always preventing combining.
27309 // TODO - attempt to narrow Mask back to writemask size.
27310 bool IsEVEXShuffle =
27311 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27312 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27315 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27317 // Handle 128-bit lane shuffles of 256-bit vectors.
27318 // TODO - this should support binary shuffles.
27319 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27320 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27321 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27322 return false; // Nothing to do!
27323 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27324 unsigned PermMask = 0;
27325 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27326 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
27328 Res = DAG.getBitcast(ShuffleVT, V1);
27329 DCI.AddToWorklist(Res.getNode());
27330 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27331 DAG.getUNDEF(ShuffleVT),
27332 DAG.getConstant(PermMask, DL, MVT::i8));
27333 DCI.AddToWorklist(Res.getNode());
27334 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27339 // For masks that have been widened to 128-bit elements or more,
27340 // narrow back down to 64-bit elements.
27341 SmallVector<int, 64> Mask;
27342 if (BaseMaskEltSizeInBits > 64) {
27343 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27344 int MaskScale = BaseMaskEltSizeInBits / 64;
27345 scaleShuffleMask(MaskScale, BaseMask, Mask);
27347 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27350 unsigned NumMaskElts = Mask.size();
27351 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27353 // Determine the effective mask value type.
27354 FloatDomain &= (32 <= MaskEltSizeInBits);
27355 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27356 : MVT::getIntegerVT(MaskEltSizeInBits);
27357 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27359 // Only allow legal mask types.
27360 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27363 // Attempt to match the mask against known shuffle patterns.
27364 MVT ShuffleSrcVT, ShuffleVT;
27365 unsigned Shuffle, PermuteImm;
27367 // Which shuffle domains are permitted?
27368 // Permit domain crossing at higher combine depths.
27369 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27370 bool AllowIntDomain = !FloatDomain || (Depth > 3);
27372 if (UnaryShuffle) {
27373 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
27374 // directly if we don't shuffle the lower element and we shuffle the upper
27375 // (zero) elements within themselves.
27376 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27377 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27378 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27379 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27380 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27381 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27382 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27388 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27389 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27391 if (Depth == 1 && Root.getOpcode() == Shuffle)
27392 return false; // Nothing to do!
27393 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27394 return false; // AVX512 Writemask clash.
27395 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27396 DCI.AddToWorklist(Res.getNode());
27397 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27398 DCI.AddToWorklist(Res.getNode());
27399 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27404 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27405 AllowIntDomain, Subtarget, Shuffle,
27406 ShuffleVT, PermuteImm)) {
27407 if (Depth == 1 && Root.getOpcode() == Shuffle)
27408 return false; // Nothing to do!
27409 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27410 return false; // AVX512 Writemask clash.
27411 Res = DAG.getBitcast(ShuffleVT, V1);
27412 DCI.AddToWorklist(Res.getNode());
27413 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27414 DAG.getConstant(PermuteImm, DL, MVT::i8));
27415 DCI.AddToWorklist(Res.getNode());
27416 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27422 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27423 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27425 if (Depth == 1 && Root.getOpcode() == Shuffle)
27426 return false; // Nothing to do!
27427 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27428 return false; // AVX512 Writemask clash.
27429 V1 = DAG.getBitcast(ShuffleVT, V1);
27430 DCI.AddToWorklist(V1.getNode());
27431 V2 = DAG.getBitcast(ShuffleVT, V2);
27432 DCI.AddToWorklist(V2.getNode());
27433 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27434 DCI.AddToWorklist(Res.getNode());
27435 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27440 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27441 AllowIntDomain, V1, V2, DL, DAG,
27442 Subtarget, Shuffle, ShuffleVT,
27444 if (Depth == 1 && Root.getOpcode() == Shuffle)
27445 return false; // Nothing to do!
27446 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27447 return false; // AVX512 Writemask clash.
27448 V1 = DAG.getBitcast(ShuffleVT, V1);
27449 DCI.AddToWorklist(V1.getNode());
27450 V2 = DAG.getBitcast(ShuffleVT, V2);
27451 DCI.AddToWorklist(V2.getNode());
27452 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27453 DAG.getConstant(PermuteImm, DL, MVT::i8));
27454 DCI.AddToWorklist(Res.getNode());
27455 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27460 // Don't try to re-form single instruction chains under any circumstances now
27461 // that we've done encoding canonicalization for them.
27465 bool MaskContainsZeros =
27466 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27468 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27469 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27470 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27471 ((Subtarget.hasAVX2() &&
27472 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27473 (Subtarget.hasAVX512() &&
27474 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27475 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27476 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27477 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27478 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27479 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27480 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27481 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27482 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27483 DCI.AddToWorklist(VPermMask.getNode());
27484 Res = DAG.getBitcast(MaskVT, V1);
27485 DCI.AddToWorklist(Res.getNode());
27486 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27487 DCI.AddToWorklist(Res.getNode());
27488 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27493 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27494 // vector as the second source.
27495 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27496 ((Subtarget.hasAVX512() &&
27497 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27498 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27499 (Subtarget.hasVLX() &&
27500 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27501 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27502 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27503 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27504 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27505 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27506 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27507 for (unsigned i = 0; i != NumMaskElts; ++i)
27508 if (Mask[i] == SM_SentinelZero)
27509 Mask[i] = NumMaskElts + i;
27511 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27512 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27513 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27514 DCI.AddToWorklist(VPermMask.getNode());
27515 Res = DAG.getBitcast(MaskVT, V1);
27516 DCI.AddToWorklist(Res.getNode());
27517 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27518 DCI.AddToWorklist(Zero.getNode());
27519 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27520 DCI.AddToWorklist(Res.getNode());
27521 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27526 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27527 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27528 ((Subtarget.hasAVX512() &&
27529 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27530 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27531 (Subtarget.hasVLX() &&
27532 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27533 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27534 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27535 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27536 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27537 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27538 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27539 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27540 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27541 DCI.AddToWorklist(VPermMask.getNode());
27542 V1 = DAG.getBitcast(MaskVT, V1);
27543 DCI.AddToWorklist(V1.getNode());
27544 V2 = DAG.getBitcast(MaskVT, V2);
27545 DCI.AddToWorklist(V2.getNode());
27546 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27547 DCI.AddToWorklist(Res.getNode());
27548 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27555 // See if we can combine a single input shuffle with zeros to a bit-mask,
27556 // which is much simpler than any shuffle.
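// For example (an illustrative sketch, not tied to a specific test): a unary
// v4i32 shuffle mask <0, zero, 2, zero> only needs to clear lanes 1 and 3, so
// it can be lowered as an AND with the constant <-1, 0, -1, 0>.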
27557 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27558 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27559 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27560 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27561 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27562 APInt UndefElts(NumMaskElts, 0);
27563 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27564 for (unsigned i = 0; i != NumMaskElts; ++i) {
27566 if (M == SM_SentinelUndef) {
27567 UndefElts.setBit(i);
27570 if (M == SM_SentinelZero)
27572 EltBits[i] = AllOnes;
27574 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27575 DCI.AddToWorklist(BitMask.getNode());
27576 Res = DAG.getBitcast(MaskVT, V1);
27577 DCI.AddToWorklist(Res.getNode());
27578 unsigned AndOpcode =
27579 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27580 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27581 DCI.AddToWorklist(Res.getNode());
27582 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27587 // If we have a single input shuffle with different shuffle patterns in the
27588 // 128-bit lanes, lower it to VPERMILPS with a variable mask.
27589 // TODO: Combine other mask types at higher depths.
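// For example, a v8f32 mask <1,0,3,2,6,7,4,5> has per-lane patterns <1,0,3,2>
// and <2,3,0,1>, so no single immediate form fits; the in-lane indices
// (M % 4) give the variable VPERMILPS mask <1,0,3,2,2,3,0,1>.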
27590 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27591 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27592 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27593 SmallVector<SDValue, 16> VPermIdx;
27594 for (int M : Mask) {
27596 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27597 VPermIdx.push_back(Idx);
27599 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
27600 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
27601 DCI.AddToWorklist(VPermMask.getNode());
27602 Res = DAG.getBitcast(MaskVT, V1);
27603 DCI.AddToWorklist(Res.getNode());
27604 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27605 DCI.AddToWorklist(Res.getNode());
27606 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27611 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27612 // to VPERMIL2PD/VPERMIL2PS.
27613 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27614 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27615 MaskVT == MVT::v8f32)) {
27616 // VPERMIL2 Operation.
27617 // Bits[3] - Match Bit.
27618 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27619 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
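// For example (illustrative), in the PS encoding an index of 6 (0b110)
// selects in-lane element 2 of the second source, while an index of 8 sets
// the match bit so the element is zeroed (subject to the M2Z immediate).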
27620 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27621 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27622 SmallVector<int, 8> VPerm2Idx;
27623 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
27624 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
27625 unsigned M2ZImm = 0;
27626 for (int M : Mask) {
27627 if (M == SM_SentinelUndef) {
27628 VPerm2Idx.push_back(-1);
27631 if (M == SM_SentinelZero) {
27633 VPerm2Idx.push_back(8);
27636 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27637 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27638 VPerm2Idx.push_back(Index);
27640 V1 = DAG.getBitcast(MaskVT, V1);
27641 DCI.AddToWorklist(V1.getNode());
27642 V2 = DAG.getBitcast(MaskVT, V2);
27643 DCI.AddToWorklist(V2.getNode());
27644 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
27645 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27646 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27647 DAG.getConstant(M2ZImm, DL, MVT::i8));
27648 DCI.AddToWorklist(Res.getNode());
27649 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27654 // If we have 3 or more shuffle instructions or a chain involving a variable
27655 // mask, we can replace them with a single PSHUFB instruction profitably.
27656 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27657 // instructions, but in practice PSHUFB tends to be *very* fast so we're
27658 // more aggressive.
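// For example, a unary v4i32 mask <1, u, zero, 3> widens (Ratio = 4) to the
// byte mask <4,5,6,7, u,u,u,u, 255,255,255,255, 12,13,14,15>, where 255 has
// the high bit set so PSHUFB zeroes that byte.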
27659 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27660 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27661 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27662 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
27663 SmallVector<SDValue, 16> PSHUFBMask;
27664 int NumBytes = RootVT.getSizeInBits() / 8;
27665 int Ratio = NumBytes / NumMaskElts;
27666 for (int i = 0; i < NumBytes; ++i) {
27667 int M = Mask[i / Ratio];
27668 if (M == SM_SentinelUndef) {
27669 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
27672 if (M == SM_SentinelZero) {
27673 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
27676 M = Ratio * M + i % Ratio;
27677 assert((M / 16) == (i / 16) && "Lane crossing detected");
27678 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27680 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
27681 Res = DAG.getBitcast(ByteVT, V1);
27682 DCI.AddToWorklist(Res.getNode());
27683 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
27684 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
27685 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
27686 DCI.AddToWorklist(Res.getNode());
27687 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27692 // With XOP, if we have a 128-bit binary input shuffle we can always combine
27693 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
27694 // slower than PSHUFB on targets that support both.
27695 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
27696 Subtarget.hasXOP()) {
27697 // VPPERM Mask Operation
27698 // Bits[4:0] - Byte Index (0 - 31)
27699 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
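// For example, mask byte 0x12 (18) selects byte 2 of the second source from
// the 32-byte concatenation of the inputs, while 0x80 sets the ZERO permute
// operation so the result byte is zeroed.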
27700 SmallVector<SDValue, 16> VPPERMMask;
27702 int Ratio = NumBytes / NumMaskElts;
27703 for (int i = 0; i < NumBytes; ++i) {
27704 int M = Mask[i / Ratio];
27705 if (M == SM_SentinelUndef) {
27706 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
27709 if (M == SM_SentinelZero) {
27710 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
27713 M = Ratio * M + i % Ratio;
27714 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27716 MVT ByteVT = MVT::v16i8;
27717 V1 = DAG.getBitcast(ByteVT, V1);
27718 DCI.AddToWorklist(V1.getNode());
27719 V2 = DAG.getBitcast(ByteVT, V2);
27720 DCI.AddToWorklist(V2.getNode());
27721 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
27722 DCI.AddToWorklist(VPPERMMaskOp.getNode());
27723 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
27724 DCI.AddToWorklist(Res.getNode());
27725 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27730 // Failed to find any combines.
27734 // Attempt to constant fold all of the constant source ops.
27735 // Returns true if the entire shuffle is folded to a constant.
27736 // TODO: Extend this to merge multiple constant Ops and update the mask.
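// For example (a sketch): a unary shuffle of the constant <1,2,3,4> with the
// mask <3,2,1,0> can be replaced outright by the constant <4,3,2,1>.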
27737 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
27738 ArrayRef<int> Mask, SDValue Root,
27739 bool HasVariableMask, SelectionDAG &DAG,
27740 TargetLowering::DAGCombinerInfo &DCI,
27741 const X86Subtarget &Subtarget) {
27742 MVT VT = Root.getSimpleValueType();
27744 unsigned SizeInBits = VT.getSizeInBits();
27745 unsigned NumMaskElts = Mask.size();
27746 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
27747 unsigned NumOps = Ops.size();
27749 // Extract constant bits from each source op.
27750 bool OneUseConstantOp = false;
27751 SmallVector<APInt, 16> UndefEltsOps(NumOps);
27752 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
27753 for (unsigned i = 0; i != NumOps; ++i) {
27754 SDValue SrcOp = Ops[i];
27755 OneUseConstantOp |= SrcOp.hasOneUse();
27756 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
27761 // Only fold if at least one of the constants is only used once or
27762 // the combined shuffle has included a variable mask shuffle; this
27763 // is to avoid constant pool bloat.
27764 if (!OneUseConstantOp && !HasVariableMask)
27767 // Shuffle the constant bits according to the mask.
27768 APInt UndefElts(NumMaskElts, 0);
27769 APInt ZeroElts(NumMaskElts, 0);
27770 APInt ConstantElts(NumMaskElts, 0);
27771 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
27772 APInt::getNullValue(MaskSizeInBits));
27773 for (unsigned i = 0; i != NumMaskElts; ++i) {
27775 if (M == SM_SentinelUndef) {
27776 UndefElts.setBit(i);
27778 } else if (M == SM_SentinelZero) {
27779 ZeroElts.setBit(i);
27782 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
27784 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
27785 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
27787 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
27788 if (SrcUndefElts[SrcMaskIdx]) {
27789 UndefElts.setBit(i);
27793 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
27794 APInt &Bits = SrcEltBits[SrcMaskIdx];
27796 ZeroElts.setBit(i);
27800 ConstantElts.setBit(i);
27801 ConstantBitData[i] = Bits;
27803 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
27805 // Create the constant data.
27807 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27808 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
27810 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27812 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
27815 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27816 DCI.AddToWorklist(CstOp.getNode());
27817 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
27821 /// \brief Fully generic combining of x86 shuffle instructions.
27823 /// This should be the last combine run over the x86 shuffle instructions. Once
27824 /// they have been fully optimized, this will recursively consider all chains
27825 /// of single-use shuffle instructions, build a generic model of the cumulative
27826 /// shuffle operation, and check for simpler instructions which implement this
27827 /// operation. We use this primarily for two purposes:
27829 /// 1) Collapse generic shuffles to specialized single instructions when
27830 /// equivalent. In most cases, this is just an encoding size win, but
27831 /// sometimes we will collapse multiple generic shuffles into a single
27832 /// special-purpose shuffle.
27833 /// 2) Look for sequences of shuffle instructions with 3 or more total
27834 /// instructions, and replace them with the slightly more expensive SSSE3
27835 /// PSHUFB instruction if available. We do this as the last combining step
27836 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27837 /// a suitable short sequence of other instructions. The PSHUFB will either
27838 /// use a register or have to read from memory and so is slightly (but only
27839 /// slightly) more expensive than the other shuffle instructions.
27841 /// Because this is inherently a quadratic operation (for each shuffle in
27842 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27843 /// This should never be an issue in practice as the shuffle lowering doesn't
27844 /// produce sequences of more than 8 instructions.
27846 /// FIXME: We will currently miss some cases where the redundant shuffling
27847 /// would simplify under the threshold for PSHUFB formation because of
27848 /// combine-ordering. To fix this, we should do the redundant instruction
27849 /// combining in this recursive walk.
27850 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27851 int SrcOpIndex, SDValue Root,
27852 ArrayRef<int> RootMask,
27853 ArrayRef<const SDNode*> SrcNodes,
27854 int Depth, bool HasVariableMask,
27856 TargetLowering::DAGCombinerInfo &DCI,
27857 const X86Subtarget &Subtarget) {
27858 // Bound the depth of our recursive combine because this is ultimately
27859 // quadratic in nature.
27863 // Directly rip through bitcasts to find the underlying operand.
27864 SDValue Op = SrcOps[SrcOpIndex];
27865 Op = peekThroughOneUseBitcasts(Op);
27867 MVT VT = Op.getSimpleValueType();
27868 if (!VT.isVector())
27869 return false; // Bail if we hit a non-vector.
27871 assert(Root.getSimpleValueType().isVector() &&
27872 "Shuffles operate on vector types!");
27873 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27874 "Can only combine shuffles of the same vector register size.");
27876 // Extract target shuffle mask and resolve sentinels and inputs.
27877 SmallVector<int, 64> OpMask;
27878 SmallVector<SDValue, 2> OpInputs;
27879 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
27882 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
27883 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
27884 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
27886 // Add the inputs to the Ops list, avoiding duplicates.
27887 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
27889 int InputIdx0 = -1, InputIdx1 = -1;
27890 for (int i = 0, e = Ops.size(); i < e; ++i) {
27891 SDValue BC = peekThroughBitcasts(Ops[i]);
27892 if (Input0 && BC == peekThroughBitcasts(Input0))
27894 if (Input1 && BC == peekThroughBitcasts(Input1))
27898 if (Input0 && InputIdx0 < 0) {
27899 InputIdx0 = SrcOpIndex;
27900 Ops[SrcOpIndex] = Input0;
27902 if (Input1 && InputIdx1 < 0) {
27903 InputIdx1 = Ops.size();
27904 Ops.push_back(Input1);
27907 assert(((RootMask.size() > OpMask.size() &&
27908 RootMask.size() % OpMask.size() == 0) ||
27909 (OpMask.size() > RootMask.size() &&
27910 OpMask.size() % RootMask.size() == 0) ||
27911 OpMask.size() == RootMask.size()) &&
27912 "The smaller number of elements must divide the larger.");
27913 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27914 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27915 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27916 assert(((RootRatio == 1 && OpRatio == 1) ||
27917 (RootRatio == 1) != (OpRatio == 1)) &&
27918 "Must not have a ratio for both incoming and op masks!");
27920 SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
27922 // Merge this shuffle operation's mask into our accumulated mask. Note that
27923 // this shuffle's mask will be the first applied to the input, followed by the
27924 // root mask to get us all the way to the root value arrangement. The reason
27925 // for this order is that we are recursing up the operation chain.
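// For example, composing a root mask <2,3,0,1> over an op mask <1,0,3,2>
// yields Mask[i] = OpMask[RootMask[i]] = <3,2,1,0> (ignoring ratios and
// multiple-input offsets for this sketch).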
27926 for (int i = 0; i < MaskWidth; ++i) {
27927 int RootIdx = i / RootRatio;
27928 if (RootMask[RootIdx] < 0) {
27929 // This is a zero or undef lane, we're done.
27930 Mask[i] = RootMask[RootIdx];
27934 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27936 // Just insert the scaled root mask value if it references an input other
27937 // than the SrcOp we're currently inserting.
27938 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27939 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27940 Mask[i] = RootMaskedIdx;
27944 RootMaskedIdx %= MaskWidth;
27946 int OpIdx = RootMaskedIdx / OpRatio;
27947 if (OpMask[OpIdx] < 0) {
27948 // The incoming lanes are zero or undef, it doesn't matter which ones we
27950 Mask[i] = OpMask[OpIdx];
27954 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
27955 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27956 OpMaskedIdx %= MaskWidth;
27958 if (OpMask[OpIdx] < (int)OpMask.size()) {
27959 assert(0 <= InputIdx0 && "Unknown target shuffle input");
27960 OpMaskedIdx += InputIdx0 * MaskWidth;
27962 assert(0 <= InputIdx1 && "Unknown target shuffle input");
27963 OpMaskedIdx += InputIdx1 * MaskWidth;
27966 Mask[i] = OpMaskedIdx;
27969 // Handle the all undef/zero cases early.
27970 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
27971 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
27974 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
27975 // TODO - should we handle the mixed zero/undef case as well? Just returning
27976 // a zero mask will lose information on undef elements, possibly reducing
27977 // future combine possibilities.
27978 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
27979 Subtarget, DAG, SDLoc(Root)));
27983 // Remove unused shuffle source ops.
27984 resolveTargetShuffleInputsAndMask(Ops, Mask);
27985 assert(!Ops.empty() && "Shuffle with no inputs detected");
27987 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
27989 // Update the list of shuffle nodes that have been combined so far.
27990 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
27992 CombinedNodes.push_back(Op.getNode());
27994 // See if we can recurse into each shuffle source op (if it's a target
27995 // shuffle). The source op should only be combined if it either has a
27996 // single use (i.e. current Op) or all its users have already been combined.
27997 for (int i = 0, e = Ops.size(); i < e; ++i)
27998 if (Ops[i].getNode()->hasOneUse() ||
27999 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
28000 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
28001 Depth + 1, HasVariableMask, DAG, DCI,
28005 // Attempt to constant fold all of the constant source ops.
28006 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
28010 // We can only combine unary and binary shuffle mask cases.
28011 if (Ops.size() > 2)
28014 // Minor canonicalization of the accumulated shuffle mask to make it easier
28015 // to match below. All this does is detect masks with sequential pairs of
28016 // elements, and shrink them to the half-width mask. It does this in a loop
28017 // so it will reduce the size of the mask to the minimal width mask which
28018 // performs an equivalent shuffle.
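// For example, the v4i32 mask <2,3,0,1> pairs up into the equivalent v2i64
// mask <1,0>, which is more likely to match a 64-bit shuffle pattern.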
28019 SmallVector<int, 64> WidenedMask;
28020 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
28021 Mask = std::move(WidenedMask);
28024 // Canonicalization of binary shuffle masks to improve pattern matching by
28025 // commuting the inputs.
28026 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
28027 ShuffleVectorSDNode::commuteMask(Mask);
28028 std::swap(Ops[0], Ops[1]);
28031 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
28035 /// \brief Get the PSHUF-style mask from PSHUF node.
28037 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
28038 /// PSHUF-style masks that can be reused with such instructions.
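/// For example, a PSHUFD with immediate 0xB1 (0b10'11'00'01) corresponds to
/// the mask <1,0,3,2>.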
28039 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28040 MVT VT = N.getSimpleValueType();
28041 SmallVector<int, 4> Mask;
28042 SmallVector<SDValue, 2> Ops;
28045 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28049 // If we have more than 128-bits, only the low 128-bits of shuffle mask
28050 // matter. Check that the upper masks are repeats and remove them.
28051 if (VT.getSizeInBits() > 128) {
28052 int LaneElts = 128 / VT.getScalarSizeInBits();
28054 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28055 for (int j = 0; j < LaneElts; ++j)
28056 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28057 "Mask doesn't repeat in high 128-bit lanes!");
28059 Mask.resize(LaneElts);
28062 switch (N.getOpcode()) {
28063 case X86ISD::PSHUFD:
28065 case X86ISD::PSHUFLW:
28068 case X86ISD::PSHUFHW:
28069 Mask.erase(Mask.begin(), Mask.begin() + 4);
28070 for (int &M : Mask)
28074 llvm_unreachable("No valid shuffle instruction found!");
28078 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28080 /// We walk up the chain and look for a combinable shuffle, skipping over
28081 /// shuffles that we could hoist this shuffle's transformation past without
28082 /// altering anything.
28084 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28085 SelectionDAG &DAG) {
28086 assert(N.getOpcode() == X86ISD::PSHUFD &&
28087 "Called with something other than an x86 128-bit half shuffle!");
28090 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28091 // of the shuffles in the chain so that we can form a fresh chain to replace
28093 SmallVector<SDValue, 8> Chain;
28094 SDValue V = N.getOperand(0);
28095 for (; V.hasOneUse(); V = V.getOperand(0)) {
28096 switch (V.getOpcode()) {
28098 return SDValue(); // Nothing combined!
28101 // Skip bitcasts as we always know the type for the target specific
28105 case X86ISD::PSHUFD:
28106 // Found another dword shuffle.
28109 case X86ISD::PSHUFLW:
28110 // Check that the low words (being shuffled) are the identity in the
28111 // dword shuffle, and the high words are self-contained.
28112 if (Mask[0] != 0 || Mask[1] != 1 ||
28113 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
28116 Chain.push_back(V);
28119 case X86ISD::PSHUFHW:
28120 // Check that the high words (being shuffled) are the identity in the
28121 // dword shuffle, and the low words are self-contained.
28122 if (Mask[2] != 2 || Mask[3] != 3 ||
28123 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
28126 Chain.push_back(V);
28129 case X86ISD::UNPCKL:
28130 case X86ISD::UNPCKH:
28131 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28132 // shuffle into a preceding word shuffle.
28133 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28134 V.getSimpleValueType().getVectorElementType() != MVT::i16)
28137 // Search for a half-shuffle which we can combine with.
28138 unsigned CombineOp =
28139 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
28140 if (V.getOperand(0) != V.getOperand(1) ||
28141 !V->isOnlyUserOf(V.getOperand(0).getNode()))
28143 Chain.push_back(V);
28144 V = V.getOperand(0);
28146 switch (V.getOpcode()) {
28148 return SDValue(); // Nothing to combine.
28150 case X86ISD::PSHUFLW:
28151 case X86ISD::PSHUFHW:
28152 if (V.getOpcode() == CombineOp)
28155 Chain.push_back(V);
28159 V = V.getOperand(0);
28163 } while (V.hasOneUse());
28166 // Break out of the loop if we break out of the switch.
28170 if (!V.hasOneUse())
28171 // We fell out of the loop without finding a viable combining instruction.
28174 // Merge this node's mask and our incoming mask.
28175 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28176 for (int &M : Mask)
28178 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28179 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28181 // Rebuild the chain around this new shuffle.
28182 while (!Chain.empty()) {
28183 SDValue W = Chain.pop_back_val();
28185 if (V.getValueType() != W.getOperand(0).getValueType())
28186 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28188 switch (W.getOpcode()) {
28190 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28192 case X86ISD::UNPCKL:
28193 case X86ISD::UNPCKH:
28194 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
28197 case X86ISD::PSHUFD:
28198 case X86ISD::PSHUFLW:
28199 case X86ISD::PSHUFHW:
28200 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28204 if (V.getValueType() != N.getValueType())
28205 V = DAG.getBitcast(N.getValueType(), V);
28207 // Return the new chain to replace N.
28211 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
28214 /// We walk up the chain, skipping shuffles of the other half and looking
28215 /// through shuffles which switch halves trying to find a shuffle of the same
28216 /// pair of dwords.
28217 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28219 TargetLowering::DAGCombinerInfo &DCI) {
28221 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28222 "Called with something other than an x86 128-bit half shuffle!");
28224 unsigned CombineOpcode = N.getOpcode();
28226 // Walk up a single-use chain looking for a combinable shuffle.
28227 SDValue V = N.getOperand(0);
28228 for (; V.hasOneUse(); V = V.getOperand(0)) {
28229 switch (V.getOpcode()) {
28231 return false; // Nothing combined!
28234 // Skip bitcasts as we always know the type for the target specific
28238 case X86ISD::PSHUFLW:
28239 case X86ISD::PSHUFHW:
28240 if (V.getOpcode() == CombineOpcode)
28243 // Other-half shuffles are no-ops.
28246 // Break out of the loop if we break out of the switch.
28250 if (!V.hasOneUse())
28251 // We fell out of the loop without finding a viable combining instruction.
28254 // Combine away the bottom node as its shuffle will be accumulated into
28255 // a preceding shuffle.
28256 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28258 // Record the old value.
28261 // Merge this node's mask and our incoming mask (adjusted to account for all
28262 // the pshufd instructions encountered).
28263 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28264 for (int &M : Mask)
28266 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28267 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28269 // Check that the shuffles didn't cancel each other out. If not, we need to
28270 // combine to the new one.
28272 // Replace the combinable shuffle with the combined one, updating all users
28273 // so that we re-evaluate the chain here.
28274 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28279 /// \brief Try to combine x86 target specific shuffles.
28280 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28281 TargetLowering::DAGCombinerInfo &DCI,
28282 const X86Subtarget &Subtarget) {
28284 MVT VT = N.getSimpleValueType();
28285 SmallVector<int, 4> Mask;
28287 unsigned Opcode = N.getOpcode();
28289 case X86ISD::PSHUFD:
28290 case X86ISD::PSHUFLW:
28291 case X86ISD::PSHUFHW:
28292 Mask = getPSHUFShuffleMask(N);
28293 assert(Mask.size() == 4);
28295 case X86ISD::UNPCKL: {
28296 auto Op0 = N.getOperand(0);
28297 auto Op1 = N.getOperand(1);
28298 unsigned Opcode0 = Op0.getOpcode();
28299 unsigned Opcode1 = Op1.getOpcode();
28301 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28302 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28303 // TODO: Add other horizontal operations as required.
28304 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28305 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28307 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28308 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28309 // moves upper half elements into the lower half part. For example:
28311 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
28313 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28315 // will be combined to:
28317 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28319 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
28320 // happen due to advanced instructions.
28321 if (!VT.is128BitVector())
28324 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28325 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28327 unsigned NumElts = VT.getVectorNumElements();
28328 SmallVector<int, 8> ExpectedMask(NumElts, -1);
28329 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
28332 auto ShufOp = Op1.getOperand(0);
28333 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28334 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
28338 case X86ISD::BLENDI: {
28339 SDValue V0 = N->getOperand(0);
28340 SDValue V1 = N->getOperand(1);
28341 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28342 "Unexpected input vector types");
28344 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28345 // operands and changing the mask to 1. This saves us a bunch of
28346 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28347 // x86InstrInfo knows how to commute this back after instruction selection
28348 // if it would help register allocation.
28350 // TODO: If optimizing for size or a processor that doesn't suffer from
28351 // partial register update stalls, this should be transformed into a MOVSD
28352 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
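// For example, a v2f64 blend with mask 2 takes element 0 from the first
// operand and element 1 from the second; swapping the operands and using
// mask 1 selects exactly the same elements.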
28354 if (VT == MVT::v2f64)
28355 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28356 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28357 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28358 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
28363 case X86ISD::MOVSD:
28364 case X86ISD::MOVSS: {
28365 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28366 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28367 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28368 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28369 if (isZero0 && isZero1)
28372 // We often lower to MOVSD/MOVSS from integer as well as native float
28373 // types; remove unnecessary domain-crossing bitcasts if we can to make it
28374 // easier to combine shuffles later on. We've already accounted for the
28375 // domain switching cost when we decided to lower with it.
28376 bool isFloat = VT.isFloatingPoint();
28377 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28378 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28379 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28380 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28381 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28382 V0 = DAG.getBitcast(NewVT, V0);
28383 V1 = DAG.getBitcast(NewVT, V1);
28384 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
28389 case X86ISD::INSERTPS: {
28390 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28391 SDValue Op0 = N.getOperand(0);
28392 SDValue Op1 = N.getOperand(1);
28393 SDValue Op2 = N.getOperand(2);
28394 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28395 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28396 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28397 unsigned ZeroMask = InsertPSMask & 0xF;
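// For example, an InsertPSMask of 0x9C decodes to SrcIdx = 2, DstIdx = 1 and
// ZeroMask = 0b1100 (zero output elements 2 and 3).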
28399 // If we zero out all elements from Op0 then we don't need to reference it.
28400 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28401 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28402 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28404 // If we zero out the element from Op1 then we don't need to reference it.
28405 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28406 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28407 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28409 // Attempt to merge insertps Op1 with an inner target shuffle node.
28410 SmallVector<int, 8> TargetMask1;
28411 SmallVector<SDValue, 2> Ops1;
28412 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28413 int M = TargetMask1[SrcIdx];
28414 if (isUndefOrZero(M)) {
28415 // Zero/UNDEF insertion - zero out element and remove dependency.
28416 InsertPSMask |= (1u << DstIdx);
28417 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28418 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28420 // Update insertps mask srcidx and reference the source input directly.
28421 assert(0 <= M && M < 8 && "Shuffle index out of range");
28422 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28423 Op1 = Ops1[M < 4 ? 0 : 1];
28424 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28425 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28428 // Attempt to merge insertps Op0 with an inner target shuffle node.
28429 SmallVector<int, 8> TargetMask0;
28430 SmallVector<SDValue, 2> Ops0;
28431 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
28434 bool Updated = false;
28435 bool UseInput00 = false;
28436 bool UseInput01 = false;
28437 for (int i = 0; i != 4; ++i) {
28438 int M = TargetMask0[i];
28439 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28440 // No change if element is already zero or the inserted element.
28442 } else if (isUndefOrZero(M)) {
28443 // If the target mask is undef/zero then we must zero the element.
28444 InsertPSMask |= (1u << i);
28449 // The input vector element must be in place (index i in either input).
28450 if (M != i && M != (i + 4))
28453 // Determine which inputs of the target shuffle we're using.
28454 UseInput00 |= (0 <= M && M < 4);
28455 UseInput01 |= (4 <= M);
28458 // If we're not using both inputs of the target shuffle then use the
28459 // referenced input directly.
28460 if (UseInput00 && !UseInput01) {
28463 } else if (!UseInput00 && UseInput01) {
28469 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28470 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28478 // Nuke no-op shuffles that show up after combining.
28479 if (isNoopShuffleMask(Mask))
28480 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28482 // Look for simplifications involving one or two shuffle instructions.
28483 SDValue V = N.getOperand(0);
28484 switch (N.getOpcode()) {
28487 case X86ISD::PSHUFLW:
28488 case X86ISD::PSHUFHW:
28489 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28491 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28492 return SDValue(); // We combined away this shuffle, so we're done.
28494 // See if this reduces to a PSHUFD which is no more expensive and can
28495 // combine with more operations. Note that it has to at least flip the
28496 // dwords as otherwise it would have been removed as a no-op.
28497 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28498 int DMask[] = {0, 1, 2, 3};
28499 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28500 DMask[DOffset + 0] = DOffset + 1;
28501 DMask[DOffset + 1] = DOffset + 0;
28502 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28503 V = DAG.getBitcast(DVT, V);
28504 DCI.AddToWorklist(V.getNode());
28505 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28506 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28507 DCI.AddToWorklist(V.getNode());
28508 return DAG.getBitcast(VT, V);
28511 // Look for shuffle patterns which can be implemented as a single unpack.
28512 // FIXME: This doesn't handle the location of the PSHUFD generically, and
28513 // only works when we have a PSHUFD followed by two half-shuffles.
28514 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28515 (V.getOpcode() == X86ISD::PSHUFLW ||
28516 V.getOpcode() == X86ISD::PSHUFHW) &&
28517 V.getOpcode() != N.getOpcode() &&
28519 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28520 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28521 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28522 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28523 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28524 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28526 for (int i = 0; i < 4; ++i) {
28527 WordMask[i + NOffset] = Mask[i] + NOffset;
28528 WordMask[i + VOffset] = VMask[i] + VOffset;
28530 // Map the word mask through the DWord mask.
28532 for (int i = 0; i < 8; ++i)
28533 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28534 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28535 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28536 // We can replace all three shuffles with an unpack.
28537 V = DAG.getBitcast(VT, D.getOperand(0));
28538 DCI.AddToWorklist(V.getNode());
28539 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28548 case X86ISD::PSHUFD:
28549 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
28558 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB
28559 /// operation. If true is returned then the operands of ADDSUB operation
28560 /// are written to the parameters \p Opnd0 and \p Opnd1.
28562 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
28563 /// so it is easier to generically match. We also insert dummy vector shuffle
28564 /// nodes for the operands which explicitly discard the lanes which are unused
28565 /// by this operation, so that the fact that those lanes are unused can flow
28566 /// through the rest of the combiner.
28567 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28568 SDValue &Opnd0, SDValue &Opnd1) {
28570 EVT VT = N->getValueType(0);
28571 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28572 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28573 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28576 // We only handle target-independent shuffles.
28577 // FIXME: It would be easy and harmless to use the target shuffle mask
28578 // extraction tool to support more.
28579 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28582 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28583 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28585 SDValue V1 = N->getOperand(0);
28586 SDValue V2 = N->getOperand(1);
28588 // We require the first shuffle operand to be the FSUB node, and the second to
28589 // be the FADD node.
28590 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28591 ShuffleVectorSDNode::commuteMask(Mask);
28593 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28596 // If there are other uses of these operations we can't fold them.
28597 if (!V1->hasOneUse() || !V2->hasOneUse())
28600 // Ensure that both operations have the same operands. Note that we can
28601 // commute the FADD operands.
28602 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28603 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28604 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28607 // We're looking for blends between FADD and FSUB nodes. We insist on these
28608 // nodes being lined up in a specific expected pattern.
28609 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28610 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28611 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28612 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28613 8, 25, 10, 27, 12, 29, 14, 31})))
28621 /// \brief Try to combine a shuffle into a target-specific add-sub or
28622 /// mul-add-sub node.
28623 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28624 const X86Subtarget &Subtarget,
28625 SelectionDAG &DAG) {
28626 SDValue Opnd0, Opnd1;
28627 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28630 EVT VT = N->getValueType(0);
28633 // Try to generate X86ISD::FMADDSUB node here.
28635 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28636 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28638 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28639 // the ADDSUB idiom has been successfully recognized. There are no known
28640 // X86 targets with 512-bit ADDSUB instructions!
28641 if (VT.is512BitVector())
28644 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
28647 // We are looking for a shuffle where both sources are concatenated with undef
28648 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
28649 // if we can express this as a single-source shuffle, that's preferable.
28650 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
28651 const X86Subtarget &Subtarget) {
28652 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
28655 EVT VT = N->getValueType(0);
28657 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
28658 if (!VT.is128BitVector() && !VT.is256BitVector())
28661 if (VT.getVectorElementType() != MVT::i32 &&
28662 VT.getVectorElementType() != MVT::i64 &&
28663 VT.getVectorElementType() != MVT::f32 &&
28664 VT.getVectorElementType() != MVT::f64)
28667 SDValue N0 = N->getOperand(0);
28668 SDValue N1 = N->getOperand(1);
28670 // Check that both sources are concats with undef.
28671 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
28672 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
28673 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
28674 !N1.getOperand(1).isUndef())
28677 // Construct the new shuffle mask. Elements from the first source retain their
28678 // index, but elements from the second source no longer need to skip an undef.
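// For example, with v2i32 halves t1 and t2 feeding two v4i32 concats, the
// mask <0,1,4,5> over the two concats becomes <0,1,2,3> over concat(t1, t2).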
28679 SmallVector<int, 8> Mask;
28680 int NumElts = VT.getVectorNumElements();
28682 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28683 for (int Elt : SVOp->getMask())
28684 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
28687 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
28689 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
28692 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
28693 TargetLowering::DAGCombinerInfo &DCI,
28694 const X86Subtarget &Subtarget) {
28696 EVT VT = N->getValueType(0);
28697 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28698 // If we have legalized the vector types, look for blends of FADD and FSUB
28699 // nodes that we can fuse into an ADDSUB node.
28700 if (TLI.isTypeLegal(VT))
28701 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
28704 // During Type Legalization, when promoting illegal vector types,
28705 // the backend might introduce new shuffle dag nodes and bitcasts.
28707 // This code performs the following transformation:
28708 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
28709 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
28711 // We do this only if both the bitcast and the BINOP dag nodes have
28712 // one use. Also, perform this transformation only if the new binary
28713 // operation is legal. This is to avoid introducing dag nodes that
28714 // potentially need to be further expanded (or custom lowered) into a
28715 // less optimal sequence of dag nodes.
28716 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
28717 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
28718 N->getOperand(0).getOpcode() == ISD::BITCAST &&
28719 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
28720 SDValue N0 = N->getOperand(0);
28721 SDValue N1 = N->getOperand(1);
28723 SDValue BC0 = N0.getOperand(0);
28724 EVT SVT = BC0.getValueType();
28725 unsigned Opcode = BC0.getOpcode();
28726 unsigned NumElts = VT.getVectorNumElements();
28728 if (BC0.hasOneUse() && SVT.isVector() &&
28729 SVT.getVectorNumElements() * 2 == NumElts &&
28730 TLI.isOperationLegal(Opcode, VT)) {
28731 bool CanFold = false;
28737 // isOperationLegal lies for integer ops on floating point types.
28738 CanFold = VT.isInteger();
28743 // isOperationLegal lies for floating point ops on integer types.
28744 CanFold = VT.isFloatingPoint();
28748 unsigned SVTNumElts = SVT.getVectorNumElements();
28749 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28750 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
28751 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
28752 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
28753 CanFold = SVOp->getMaskElt(i) < 0;
28756 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
28757 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
28758 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
28759 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
28764 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
28765 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
28766 // consecutive, non-overlapping, and in the right order.
28767 SmallVector<SDValue, 16> Elts;
28768 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
28769 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
28770 Elts.push_back(Elt);
28777 if (Elts.size() == VT.getVectorNumElements())
28778 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
28781 // For AVX2, we sometimes want to combine
28782 // (vector_shuffle <mask> (concat_vectors t1, undef)
28783 // (concat_vectors t2, undef))
28785 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
28786 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
28787 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
28790 if (isTargetShuffle(N->getOpcode())) {
28792 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
28795 // Try recursively combining arbitrary sequences of x86 shuffle
28796 // instructions into higher-order shuffles. We do this after combining
28797 // specific PSHUF instruction sequences into their minimal form so that we
28798 // can evaluate how many specialized shuffle instructions are involved in
28799 // a particular chain.
28800 SmallVector<int, 1> NonceMask; // Just a placeholder.
28801 NonceMask.push_back(0);
28802 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
28803 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
28805 return SDValue(); // This routine will use CombineTo to replace N.
28811 /// Check if a vector extract from a target-specific shuffle of a load can be
28812 /// folded into a single element load.
28813 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
28814 /// shuffles have been custom lowered so we need to handle those here.
28815 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
28816 TargetLowering::DAGCombinerInfo &DCI) {
28817 if (DCI.isBeforeLegalizeOps())
28820 SDValue InVec = N->getOperand(0);
28821 SDValue EltNo = N->getOperand(1);
28822 EVT EltVT = N->getValueType(0);
28824 if (!isa<ConstantSDNode>(EltNo))
28827 EVT OriginalVT = InVec.getValueType();
28829 // Peek through bitcasts, don't duplicate a load with other uses.
28830 InVec = peekThroughOneUseBitcasts(InVec);
28832 EVT CurrentVT = InVec.getValueType();
28833 if (!CurrentVT.isVector() ||
28834 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
28837 if (!isTargetShuffle(InVec.getOpcode()))
28840 // Don't duplicate a load with other uses.
28841 if (!InVec.hasOneUse())
28844 SmallVector<int, 16> ShuffleMask;
28845 SmallVector<SDValue, 2> ShuffleOps;
28847 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
28848 ShuffleOps, ShuffleMask, UnaryShuffle))
28851 // Select the input vector, guarding against out of range extract vector.
28852 unsigned NumElems = CurrentVT.getVectorNumElements();
28853 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28854 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28856 if (Idx == SM_SentinelZero)
28857 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28858 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28859 if (Idx == SM_SentinelUndef)
28860 return DAG.getUNDEF(EltVT);
28862 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28863 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
28866 // If inputs to shuffle are the same for both ops, then allow 2 uses
28867 unsigned AllowedUses =
28868 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28870 if (LdNode.getOpcode() == ISD::BITCAST) {
28871 // Don't duplicate a load with other uses.
28872 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
28875 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28876 LdNode = LdNode.getOperand(0);
28879 if (!ISD::isNormalLoad(LdNode.getNode()))
28882 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28884 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
28887 // If there's a bitcast before the shuffle, check if the load type and
28888 // alignment is valid.
28889 unsigned Align = LN0->getAlignment();
28890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28891 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28892 EltVT.getTypeForEVT(*DAG.getContext()));
28894 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
28897 // All checks match so transform back to vector_shuffle so that DAG combiner
28898 // can finish the job
28901 // Create a shuffle node, taking into account the case that it's a unary shuffle.
28902 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28903 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
28905 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28906 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
28910 // Try to match patterns such as
28911 // (i16 bitcast (v16i1 x))
28913 // (i16 movmsk (v16i8 sext (v16i1 x)))
28914 // before the illegal vector is scalarized on subtargets that don't have legal
28916 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
28917 const X86Subtarget &Subtarget) {
28918 EVT VT = BitCast.getValueType();
28919 SDValue N0 = BitCast.getOperand(0);
28920 EVT VecVT = N0->getValueType(0);
28922 if (!VT.isScalarInteger() || !VecVT.isSimple())
28925 // With AVX512 vxi1 types are legal and we prefer using k-regs.
28926 // MOVMSK is supported in SSE2 or later.
28927 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
28930 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
28931 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
28932 // v8i16 and v16i16.
28933 // For these two cases, we can shuffle the upper element bytes to a
28934 // consecutive sequence at the start of the vector and treat the results as
28935 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
28936 // for v16i16 this is not the case, because the shuffle is expensive, so we
28937 // avoid sign-extending to this type entirely.
28938 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
28939 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
28941 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
28942 switch (VecVT.getSimpleVT().SimpleTy) {
28946 SExtVT = MVT::v2i64;
28947 FPCastVT = MVT::v2f64;
28950 SExtVT = MVT::v4i32;
28951 FPCastVT = MVT::v4f32;
28952 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
28953 // sign-extend to a 256-bit operation to avoid truncation.
28954 if (N0->getOpcode() == ISD::SETCC &&
28955 N0->getOperand(0)->getValueType(0).is256BitVector() &&
28956 Subtarget.hasInt256()) {
28957 SExtVT = MVT::v4i64;
28958 FPCastVT = MVT::v4f64;
28962 SExtVT = MVT::v8i16;
28963 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
28964 // sign-extend to a 256-bit operation to match the compare.
28965 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
28966 // 256-bit because the shuffle is cheaper than sign extending the result of
28968 if (N0->getOpcode() == ISD::SETCC &&
28969 N0->getOperand(0)->getValueType(0).is256BitVector() &&
28970 Subtarget.hasInt256()) {
28971 SExtVT = MVT::v8i32;
28972 FPCastVT = MVT::v8f32;
28976 SExtVT = MVT::v16i8;
28977 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
28978 // it is not profitable to sign-extend to 256-bit because this will
28979 // require an extra cross-lane shuffle which is more expensive than
28980 // truncating the result of the compare to 128-bits.
28983 // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
28984 if (!Subtarget.hasInt256())
28986 SExtVT = MVT::v32i8;
28991 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
28992 if (SExtVT == MVT::v8i16) {
28993 V = DAG.getBitcast(MVT::v16i8, V);
28994 V = DAG.getVectorShuffle(
28995 MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
28996 {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
28998 assert(SExtVT.getScalarType() != MVT::i16 &&
28999 "Vectors of i16 must be shuffled");
29000 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
29001 V = DAG.getBitcast(FPCastVT, V);
29002 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29003 return DAG.getZExtOrTrunc(V, DL, VT);
29006 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
29007 TargetLowering::DAGCombinerInfo &DCI,
29008 const X86Subtarget &Subtarget) {
29009 SDValue N0 = N->getOperand(0);
29010 EVT VT = N->getValueType(0);
29011 EVT SrcVT = N0.getValueType();
29013 // Try to match patterns such as
29014 // (i16 bitcast (v16i1 x))
29016 // (i16 movmsk (v16i8 sext (v16i1 x)))
29017 // before the setcc result is scalarized on subtargets that don't have legal
29019 if (DCI.isBeforeLegalize())
29020 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
29022 // Since MMX types are special and don't usually play with other vector types,
29023 // it's better to handle them early to be sure we emit efficient code by
29024 // avoiding store-load conversions.
29026 // Detect bitcasts between i32 to x86mmx low word.
29027 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
29028 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
29029 SDValue N00 = N0->getOperand(0);
29030 if (N00.getValueType() == MVT::i32)
29031 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
29034 // Detect bitcasts between element or subvector extraction to x86mmx.
29035 if (VT == MVT::x86mmx &&
29036 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
29037 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
29038 isNullConstant(N0.getOperand(1))) {
29039 SDValue N00 = N0->getOperand(0);
29040 if (N00.getValueType().is128BitVector())
29041 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
29042 DAG.getBitcast(MVT::v2i64, N00));
29045 // Detect bitcasts from FP_TO_SINT to x86mmx.
29046 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
29047 N0.getOpcode() == ISD::FP_TO_SINT) {
29049 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
29050 DAG.getUNDEF(MVT::v2i32));
29051 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
29052 DAG.getBitcast(MVT::v2i64, Res));
29055 // Convert a bitcasted integer logic operation that has one bitcasted
29056 // floating-point operand into a floating-point logic operation. This may
29057 // create a load of a constant, but that is cheaper than materializing the
29058 // constant in an integer register and transferring it to an SSE register or
29059 // transferring the SSE operand to integer register and back.
29061 switch (N0.getOpcode()) {
29062 case ISD::AND: FPOpcode = X86ISD::FAND; break;
29063 case ISD::OR: FPOpcode = X86ISD::FOR; break;
29064 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
29065 default: return SDValue();
29068 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
29069 (Subtarget.hasSSE2() && VT == MVT::f64)))
29072 SDValue LogicOp0 = N0.getOperand(0);
29073 SDValue LogicOp1 = N0.getOperand(1);
29076 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
29077 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
29078 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
29079 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
29080 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
29081 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
29082 }
29083 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
29084 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
29085 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
29086 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
29087 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
29088 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
29089 }
29091 return SDValue();
29092 }
29094 // Match a binop + shuffle pyramid that represents a horizontal reduction over
29095 // the elements of a vector.
29096 // Returns the vector that is being reduced on, or SDValue() if a reduction
29097 // was not matched.
29098 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
29099 // The pattern must end in an extract from index 0.
29100 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
29101 !isNullConstant(Extract->getOperand(1)))
29102 return SDValue();
29104 unsigned Stages =
29105 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
29107 SDValue Op = Extract->getOperand(0);
29108 // At each stage, we're looking for something that looks like:
29109 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
29110 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
29111 // i32 undef, i32 undef, i32 undef, i32 undef>
29112 // %a = binop <8 x i32> %op, %s
29113 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
29114 // we expect something like:
29115 // <4,5,6,7,u,u,u,u>
29116 // <2,3,u,u,u,u,u,u>
29117 // <1,u,u,u,u,u,u,u>
29118 for (unsigned i = 0; i < Stages; ++i) {
29119 if (Op.getOpcode() != BinOp)
29120 return SDValue();
29122 ShuffleVectorSDNode *Shuffle =
29123 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
29124 if (Shuffle) {
29125 Op = Op.getOperand(1);
29126 } else {
29127 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
29128 Op = Op.getOperand(0);
29129 }
29131 // The first operand of the shuffle should be the same as the other operand
29132 // of the binop.
29133 if (!Shuffle || (Shuffle->getOperand(0) != Op))
29134 return SDValue();
29136 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
29137 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
29138 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
29139 return SDValue();
29140 }
29142 return Op;
29143 }
29145 // Given a select, detect the following pattern:
29146 // 1: %2 = zext <N x i8> %0 to <N x i32>
29147 // 2: %3 = zext <N x i8> %1 to <N x i32>
29148 // 3: %4 = sub nsw <N x i32> %2, %3
29149 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29150 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
29151 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29152 // This is useful as it is the input into a SAD pattern.
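// An illustrative instantiation for N = 4 (value names are hypothetical):
// %4 = sub nsw <4 x i32> %2, %3
// %5 = icmp sgt <4 x i32> %4, <i32 -1, i32 -1, i32 -1, i32 -1>
// %6 = sub nsw <4 x i32> zeroinitializer, %4
// %7 = select <4 x i1> %5, <4 x i32> %4, <4 x i32> %6 ; == abs(%2 - %3)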
29153 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
29154 SDValue &Op1) {
29155 // Check that the condition of the select instruction is greater-than.
29156 SDValue SetCC = Select->getOperand(0);
29157 if (SetCC.getOpcode() != ISD::SETCC)
29158 return false;
29159 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
29160 if (CC != ISD::SETGT && CC != ISD::SETLT)
29161 return false;
29163 SDValue SelectOp1 = Select->getOperand(1);
29164 SDValue SelectOp2 = Select->getOperand(2);
29166 // The following instructions assume SelectOp1 is the subtraction operand
29167 // and SelectOp2 is the negation operand.
29168 // In the case of SETLT this is the other way around.
29169 if (CC == ISD::SETLT)
29170 std::swap(SelectOp1, SelectOp2);
29172 // The second operand of the select should be the negation of the first
29173 // operand, which is implemented as 0 - SelectOp1.
29174 if (!(SelectOp2.getOpcode() == ISD::SUB &&
29175 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
29176 SelectOp2.getOperand(1) == SelectOp1))
29177 return false;
29179 // The first operand of SetCC is the first operand of the select, which is the
29180 // difference between the two input vectors.
29181 if (SetCC.getOperand(0) != SelectOp1)
29182 return false;
29184 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
29185 APInt SplatVal;
29186 if ((CC == ISD::SETLT) &&
29187 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
29188 SplatVal == 1) ||
29189 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
29190 return false;
29192 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
29193 if ((CC == ISD::SETGT) &&
29194 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
29195 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
29196 return false;
29198 // The first operand of the select is the difference between the two input
29199 // vectors.
29200 if (SelectOp1.getOpcode() != ISD::SUB)
29201 return false;
29203 Op0 = SelectOp1.getOperand(0);
29204 Op1 = SelectOp1.getOperand(1);
29206 // Check if the operands of the sub are zero-extended from vectors of i8.
29207 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
29208 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
29209 Op1.getOpcode() != ISD::ZERO_EXTEND ||
29210 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
29211 return false;
29213 return true;
29214 }
29216 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
29217 // to these zexts.
29218 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
29219 const SDValue &Zext1, const SDLoc &DL) {
29221 // Find the appropriate width for the PSADBW.
29222 EVT InVT = Zext0.getOperand(0).getValueType();
29223 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
29225 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
29226 // fill in the missing vector elements with 0.
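// E.g. (sketch) a v4i8 input with RegSize == 128 gives NumConcat == 4 and
// produces (v16i8 concat_vectors X, zero, zero, zero), placing X in lane 0.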
29227 unsigned NumConcat = RegSize / InVT.getSizeInBits();
29228 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
29229 Ops[0] = Zext0.getOperand(0);
29230 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
29231 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29232 Ops[0] = Zext1.getOperand(0);
29233 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29235 // Actually build the SAD
29236 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
29237 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
29238 }
29240 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
29241 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
29242 SelectionDAG &DAG,
29243 const X86Subtarget &Subtarget) {
29244 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
29245 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
29246 return SDValue();
29248 EVT ExtractVT = Extract->getValueType(0);
29249 unsigned BitWidth = ExtractVT.getSizeInBits();
29250 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
29251 ExtractVT != MVT::i8)
29252 return SDValue();
29254 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
29255 for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
29256 SDValue Match = matchBinOpReduction(Extract, Op);
29257 if (!Match)
29258 continue;
29260 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
29261 // which we can't support here for now.
29262 if (Match.getScalarValueSizeInBits() != BitWidth)
29263 continue;
29265 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
29266 unsigned MatchSizeInBits = Match.getValueSizeInBits();
29267 if (!(MatchSizeInBits == 128 ||
29268 (MatchSizeInBits == 256 &&
29269 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
29270 continue;
29272 // Don't bother performing this for 2-element vectors.
29273 if (Match.getValueType().getVectorNumElements() <= 2)
29274 continue;
29276 // Check that we are extracting a reduction of all sign bits.
29277 if (DAG.ComputeNumSignBits(Match) != BitWidth)
29278 continue;
29280 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
29281 MVT MaskVT;
29282 if (64 == BitWidth || 32 == BitWidth)
29283 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
29284 MatchSizeInBits / BitWidth);
29285 else
29286 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
29288 APInt CompareBits;
29289 ISD::CondCode CondCode;
29290 if (Op == ISD::OR) {
29291 // any_of -> MOVMSK != 0
29292 CompareBits = APInt::getNullValue(32);
29293 CondCode = ISD::CondCode::SETNE;
29294 } else {
29295 // all_of -> MOVMSK == ((1 << NumElts) - 1)
29296 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
29297 CondCode = ISD::CondCode::SETEQ;
29298 }
29300 // Perform the select as i32/i64 and then truncate to avoid partial register
29301 // stalls.
29302 unsigned ResWidth = std::max(BitWidth, 32u);
29303 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
29304 SDLoc DL(Extract);
29305 SDValue Zero = DAG.getConstant(0, DL, ResVT);
29306 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
29307 SDValue Res = DAG.getBitcast(MaskVT, Match);
29308 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
29309 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
29310 Ones, Zero, CondCode);
29311 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
29312 }
29314 return SDValue();
29315 }
29317 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29318 const X86Subtarget &Subtarget) {
29319 // PSADBW is only supported on SSE2 and up.
29320 if (!Subtarget.hasSSE2())
29321 return SDValue();
29323 // Verify that the type we're extracting from is an integer type wider than i16.
29324 EVT VT = Extract->getOperand(0).getValueType();
29325 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
29326 return SDValue();
29328 unsigned RegSize = 128;
29329 if (Subtarget.hasBWI())
29330 RegSize = 512;
29331 else if (Subtarget.hasAVX2())
29332 RegSize = 256;
29334 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29335 // TODO: We should be able to handle larger vectors by splitting them before
29336 // feeding them into several SADs, and then reducing over those.
29337 if (RegSize / VT.getVectorNumElements() < 8)
29338 return SDValue();
29340 // Match shuffle + add pyramid.
29341 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29343 // The operand is expected to be zero extended from i8
29344 // (verified in detectZextAbsDiff).
29345 // In order to convert to i64 and above, additional any/zero/sign
29346 // extend is expected.
29347 // The zero extend from 32 bits has no mathematical effect on the result.
29348 // Also, the sign extend is effectively a zero extend here
29349 // (it extends the sign bit, which is zero).
29350 // So it is correct to skip the sign/zero extend instruction.
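// E.g. (sketch) in (i64 (zext (i32 X))) where X is the 32-bit partial SAD
// sum, the upper bits of X are already zero, so the extend is transparent.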
29351 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
29352 Root.getOpcode() == ISD::ZERO_EXTEND ||
29353 Root.getOpcode() == ISD::ANY_EXTEND))
29354 Root = Root.getOperand(0);
29356 // If there was a match, we want Root to be a select that is the root of an
29357 // abs-diff pattern.
29358 if (!Root || (Root.getOpcode() != ISD::VSELECT))
29359 return SDValue();
29361 // Check whether we have an abs-diff pattern feeding into the select.
29362 SDValue Zext0, Zext1;
29363 if (!detectZextAbsDiff(Root, Zext0, Zext1))
29364 return SDValue();
29366 // Create the SAD instruction.
29367 SDLoc DL(Extract);
29368 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29370 // If the original vector was wider than 8 elements, sum over the results
29371 // in the SAD vector.
29372 unsigned Stages = Log2_32(VT.getVectorNumElements());
29373 MVT SadVT = SAD.getSimpleValueType();
29374 if (Stages > 3) {
29375 unsigned SadElems = SadVT.getVectorNumElements();
29377 for (unsigned i = Stages - 3; i > 0; --i) {
29378 SmallVector<int, 16> Mask(SadElems, -1);
29379 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29380 Mask[j] = MaskEnd + j;
29382 SDValue Shuffle =
29383 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29384 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
29385 }
29386 }
29388 MVT Type = Extract->getSimpleValueType(0);
29389 unsigned TypeSizeInBits = Type.getSizeInBits();
29390 // Return the lowest TypeSizeInBits bits.
29391 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29392 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29393 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29394 Extract->getOperand(1));
29395 }
29397 // Attempt to peek through a target shuffle and extract the scalar from the
29398 // source.
29399 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29400 TargetLowering::DAGCombinerInfo &DCI,
29401 const X86Subtarget &Subtarget) {
29402 if (DCI.isBeforeLegalizeOps())
29403 return SDValue();
29405 SDValue Src = N->getOperand(0);
29406 SDValue Idx = N->getOperand(1);
29408 EVT VT = N->getValueType(0);
29409 EVT SrcVT = Src.getValueType();
29410 EVT SrcSVT = SrcVT.getVectorElementType();
29411 unsigned NumSrcElts = SrcVT.getVectorNumElements();
29413 // Don't attempt this for boolean mask vectors or unknown extraction indices.
29414 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29415 return SDValue();
29417 // Resolve the target shuffle inputs and mask.
29418 SmallVector<int, 16> Mask;
29419 SmallVector<SDValue, 2> Ops;
29420 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
29421 return SDValue();
29423 // Attempt to narrow/widen the shuffle mask to the correct size.
29424 if (Mask.size() != NumSrcElts) {
29425 if ((NumSrcElts % Mask.size()) == 0) {
29426 SmallVector<int, 16> ScaledMask;
29427 int Scale = NumSrcElts / Mask.size();
29428 scaleShuffleMask(Scale, Mask, ScaledMask);
29429 Mask = std::move(ScaledMask);
29430 } else if ((Mask.size() % NumSrcElts) == 0) {
29431 SmallVector<int, 16> WidenedMask;
29432 while (Mask.size() > NumSrcElts &&
29433 canWidenShuffleElements(Mask, WidenedMask))
29434 Mask = std::move(WidenedMask);
29435 // TODO - investigate support for wider shuffle masks with known upper
29436 // undef/zero elements for implicit zero-extension.
29437 }
29438 }
29440 // Check if narrowing/widening failed.
29441 if (Mask.size() != NumSrcElts)
29442 return SDValue();
29444 int SrcIdx = Mask[N->getConstantOperandVal(1)];
29445 SDLoc dl(N);
29447 // If the shuffle source element is undef/zero then we can just accept it.
29448 if (SrcIdx == SM_SentinelUndef)
29449 return DAG.getUNDEF(VT);
29451 if (SrcIdx == SM_SentinelZero)
29452 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29453 : DAG.getConstant(0, dl, VT);
29455 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29456 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29457 SrcIdx = SrcIdx % Mask.size();
29459 // We can only extract other elements from 128-bit vectors and in certain
29460 // circumstances, depending on SSE-level.
29461 // TODO: Investigate using extract_subvector for larger vectors.
29462 // TODO: Investigate float/double extraction if it will be just stored.
29463 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29464 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29465 assert(SrcSVT == VT && "Unexpected extraction type");
29466 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29467 DAG.getIntPtrConstant(SrcIdx, dl));
29470 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29471 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29472 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29473 "Unexpected extraction type");
29474 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29475 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29476 DAG.getIntPtrConstant(SrcIdx, dl));
29477 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29478 DAG.getValueType(SrcSVT));
29479 return DAG.getZExtOrTrunc(Assert, dl, VT);
29480 }
29482 return SDValue();
29483 }
29485 /// Detect vector gather/scatter index generation and convert it from being a
29486 /// bunch of shuffles and extracts into a somewhat faster sequence.
29487 /// For i686, the best sequence is apparently storing the value and loading
29488 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
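/// E.g. (sketch) for v4i32 on x64: bitcast to v2i64, extract both i64 halves,
/// then produce each i32 as either a truncate or a truncate of a 32-bit
/// arithmetic shift, as implemented below.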
29489 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29490 TargetLowering::DAGCombinerInfo &DCI,
29491 const X86Subtarget &Subtarget) {
29492 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29493 return NewOp;
29495 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29496 return NewOp;
29498 SDValue InputVector = N->getOperand(0);
29499 SDValue EltIdx = N->getOperand(1);
29501 EVT SrcVT = InputVector.getValueType();
29502 EVT VT = N->getValueType(0);
29503 SDLoc dl(InputVector);
29505 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
29506 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29507 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29508 SDValue MMXSrc = InputVector.getOperand(0);
29510 // The bitcast source is a direct mmx result.
29511 if (MMXSrc.getValueType() == MVT::x86mmx)
29512 return DAG.getBitcast(VT, InputVector);
29513 }
29515 // Detect mmx to i32 conversion through a v2i32 elt extract.
29516 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29517 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29518 SDValue MMXSrc = InputVector.getOperand(0);
29520 // The bitcast source is a direct mmx result.
29521 if (MMXSrc.getValueType() == MVT::x86mmx)
29522 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
29523 }
29525 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29526 isa<ConstantSDNode>(EltIdx) &&
29527 isa<ConstantSDNode>(InputVector.getOperand(0))) {
29528 uint64_t ExtractedElt = N->getConstantOperandVal(1);
29529 uint64_t InputValue = InputVector.getConstantOperandVal(0);
29530 uint64_t Res = (InputValue >> ExtractedElt) & 1;
29531 return DAG.getConstant(Res, dl, MVT::i1);
29532 }
29534 // Check whether this extract is the root of a sum of absolute differences
29535 // pattern. This has to be done here because we really want it to happen
29536 // pre-legalization.
29537 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29538 return SAD;
29540 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29541 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29542 return Cmp;
29544 // Only operate on vectors of 4 elements, where the alternative shuffling
29545 // gets to be more expensive.
29546 if (SrcVT != MVT::v4i32)
29547 return SDValue();
29549 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29550 // single use which is a sign-extend or zero-extend, and all elements are
29551 // used.
29552 SmallVector<SDNode *, 4> Uses;
29553 unsigned ExtractedElements = 0;
29554 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29555 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
29556 if (UI.getUse().getResNo() != InputVector.getResNo())
29557 continue;
29559 SDNode *Extract = *UI;
29560 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29561 return SDValue();
29563 if (Extract->getValueType(0) != MVT::i32)
29564 return SDValue();
29565 if (!Extract->hasOneUse())
29566 return SDValue();
29567 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29568 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29569 return SDValue();
29570 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
29571 return SDValue();
29573 // Record which element was extracted.
29574 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29575 Uses.push_back(Extract);
29576 }
29578 // If not all the elements were used, this may not be worthwhile.
29579 if (ExtractedElements != 15)
29580 return SDValue();
29582 // Ok, we've now decided to do the transformation.
29583 // If 64-bit shifts are legal, use the extract-shift sequence,
29584 // otherwise bounce the vector off the cache.
29585 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29587 SDValue Vals[4];
29588 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
29589 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29590 auto &DL = DAG.getDataLayout();
29591 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29592 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29593 DAG.getConstant(0, dl, VecIdxTy));
29594 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29595 DAG.getConstant(1, dl, VecIdxTy));
29597 SDValue ShAmt = DAG.getConstant(
29598 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29599 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29600 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29601 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29602 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29603 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29604 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29605 } else {
29606 // Store the value to a temporary stack slot.
29607 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29608 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29609 MachinePointerInfo());
29611 EVT ElementType = SrcVT.getVectorElementType();
29612 unsigned EltSize = ElementType.getSizeInBits() / 8;
29614 // Replace each use (extract) with a load of the appropriate element.
29615 for (unsigned i = 0; i < 4; ++i) {
29616 uint64_t Offset = EltSize * i;
29617 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
29618 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
29620 SDValue ScalarAddr =
29621 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
29623 // Load the scalar.
29624 Vals[i] =
29625 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
29626 }
29627 }
29629 // Replace the extracts.
29630 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
29631 UE = Uses.end(); UI != UE; ++UI) {
29632 SDNode *Extract = *UI;
29634 uint64_t IdxVal = Extract->getConstantOperandVal(1);
29635 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
29636 }
29638 // The replacement was made in place; don't return anything.
29639 return SDValue();
29640 }
29642 // TODO - merge with combineExtractVectorElt once it can handle the implicit
29643 // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
29644 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
29645 // combineBasicSADPattern.
29646 static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
29647 TargetLowering::DAGCombinerInfo &DCI,
29648 const X86Subtarget &Subtarget) {
29649 return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
29650 }
29652 /// If a vector select has an operand that is -1 or 0, try to simplify the
29653 /// select to a bitwise logic operation.
29654 static SDValue
29655 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
29656 TargetLowering::DAGCombinerInfo &DCI,
29657 const X86Subtarget &Subtarget) {
29658 SDValue Cond = N->getOperand(0);
29659 SDValue LHS = N->getOperand(1);
29660 SDValue RHS = N->getOperand(2);
29661 EVT VT = LHS.getValueType();
29662 EVT CondVT = Cond.getValueType();
29663 SDLoc DL(N);
29664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29666 if (N->getOpcode() != ISD::VSELECT)
29667 return SDValue();
29669 assert(CondVT.isVector() && "Vector select expects a vector selector!");
29671 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29672 // Check if the first operand is all zeros and Cond type is vXi1.
29673 // This situation only applies to avx512.
29674 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
29675 CondVT.getVectorElementType() == MVT::i1) {
29676 // Invert the cond to not(cond) : xor(op,allones)=not(op)
29677 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
29678 DAG.getAllOnesConstant(DL, CondVT));
29679 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
29680 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
29681 }
29683 // To use the condition operand as a bitwise mask, it must have elements that
29684 // are the same size as the select elements. Ie, the condition operand must
29685 // have already been promoted from the IR select condition type <N x i1>.
29686 // Don't check if the types themselves are equal because that excludes
29687 // vector floating-point selects.
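// E.g. (sketch) a v4f32 select needs a v4i32 condition: four 32-bit lanes
// matching the four f32 elements, even though the types themselves differ.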
29688 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
29689 return SDValue();
29691 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
29692 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
29694 // Try to invert the condition if true value is not all 1s and false value is
29695 // not all 0s.
29696 if (!TValIsAllOnes && !FValIsAllZeros &&
29697 // Check if the selector will be produced by CMPP*/PCMP*.
29698 Cond.getOpcode() == ISD::SETCC &&
29699 // Check if SETCC has already been promoted.
29700 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
29701 CondVT) {
29702 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29703 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
29705 if (TValIsAllZeros || FValIsAllOnes) {
29706 SDValue CC = Cond.getOperand(2);
29707 ISD::CondCode NewCC =
29708 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
29709 Cond.getOperand(0).getValueType().isInteger());
29710 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
29711 NewCC);
29712 std::swap(LHS, RHS);
29713 TValIsAllOnes = FValIsAllOnes;
29714 FValIsAllZeros = TValIsAllZeros;
29715 }
29716 }
29718 // vselect Cond, 111..., 000... -> Cond
29719 if (TValIsAllOnes && FValIsAllZeros)
29720 return DAG.getBitcast(VT, Cond);
29722 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
29723 return SDValue();
29725 // vselect Cond, 111..., X -> or Cond, X
29726 if (TValIsAllOnes) {
29727 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
29728 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
29729 return DAG.getBitcast(VT, Or);
29732 // vselect Cond, X, 000... -> and Cond, X
29733 if (FValIsAllZeros) {
29734 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
29735 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
29736 return DAG.getBitcast(VT, And);
29737 }
29739 return SDValue();
29740 }
29742 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
29743 SDValue Cond = N->getOperand(0);
29744 SDValue LHS = N->getOperand(1);
29745 SDValue RHS = N->getOperand(2);
29746 SDLoc DL(N);
29748 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
29749 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
29750 if (!TrueC || !FalseC)
29751 return SDValue();
29753 // Don't do this for illegal integer types.
29754 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
29755 return SDValue();
29757 // If this is efficiently invertible, canonicalize the TrueC/FalseC values
29758 // so that TrueC (the true value) is larger than FalseC.
29759 bool NeedsCondInvert = false;
29760 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
29761 // Efficiently invertible.
29762 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
29763 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
29764 isa<ConstantSDNode>(Cond.getOperand(1))))) {
29765 NeedsCondInvert = true;
29766 std::swap(TrueC, FalseC);
29769 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
29770 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
29771 if (NeedsCondInvert) // Invert the condition if needed.
29772 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29773 DAG.getConstant(1, DL, Cond.getValueType()));
29775 // Zero extend the condition if needed.
29776 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
29778 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
29779 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
29780 DAG.getConstant(ShAmt, DL, MVT::i8));
29781 }
29783 // Optimize cases that will turn into an LEA instruction. This requires
29784 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
29785 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
29786 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
29787 if (N->getValueType(0) == MVT::i32)
29788 Diff = (unsigned)Diff;
29790 bool isFastMultiplier = false;
29791 if (Diff < 10) {
29792 switch ((unsigned char)Diff) {
29793 default:
29794 break;
29795 case 1: // result = add base, cond
29796 case 2: // result = lea base( , cond*2)
29797 case 3: // result = lea base(cond, cond*2)
29798 case 4: // result = lea base( , cond*4)
29799 case 5: // result = lea base(cond, cond*4)
29800 case 8: // result = lea base( , cond*8)
29801 case 9: // result = lea base(cond, cond*8)
29802 isFastMultiplier = true;
29803 break;
29804 }
29805 }
29807 if (isFastMultiplier) {
29808 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
29809 if (NeedsCondInvert) // Invert the condition if needed.
29810 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29811 DAG.getConstant(1, DL, Cond.getValueType()));
29813 // Zero extend the condition if needed.
29814 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
29815 // Scale the condition by the difference.
29816 if (Diff != 1)
29817 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
29818 DAG.getConstant(Diff, DL, Cond.getValueType()));
29820 // Add the base if non-zero.
29821 if (FalseC->getAPIntValue() != 0)
29822 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29823 SDValue(FalseC, 0));
29824 return Cond;
29825 }
29826 }
29828 return SDValue();
29829 }
29831 // If this is a bitcasted op that can be represented as another type, push
29832 // the bitcast to the inputs. This allows more opportunities for pattern
29833 // matching masked instructions. This is called when we know that the operation
29834 // is used as one of the inputs of a vselect.
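// E.g. (sketch, AVX-512) if a v8i64 VALIGN result is bitcast to v16i32 and
// selected with a v16i1 mask, rewriting it as a v16i32 VALIGND on bitcast
// inputs lets the mask fold directly into the instruction.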
29835 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
29836 TargetLowering::DAGCombinerInfo &DCI) {
29837 // Make sure we have a bitcast.
29838 if (OrigOp.getOpcode() != ISD::BITCAST)
29839 return false;
29841 SDValue Op = OrigOp.getOperand(0);
29843 // If the operation is used by anything other than the bitcast, we shouldn't
29844 // do this combine as that would replicate the operation.
29845 if (!Op.hasOneUse())
29846 return false;
29848 MVT VT = OrigOp.getSimpleValueType();
29849 MVT EltVT = VT.getVectorElementType();
29850 SDLoc DL(Op.getNode());
29852 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
29853 SDValue Op2) {
29854 Op0 = DAG.getBitcast(VT, Op0);
29855 DCI.AddToWorklist(Op0.getNode());
29856 Op1 = DAG.getBitcast(VT, Op1);
29857 DCI.AddToWorklist(Op1.getNode());
29858 DCI.CombineTo(OrigOp.getNode(),
29859 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
29860 return true;
29861 };
29863 unsigned Opcode = Op.getOpcode();
29864 switch (Opcode) {
29865 case X86ISD::PALIGNR:
29866 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
29867 if (!VT.is128BitVector())
29868 return false;
29869 Opcode = X86ISD::VALIGN;
29870 LLVM_FALLTHROUGH;
29871 case X86ISD::VALIGN: {
29872 if (EltVT != MVT::i32 && EltVT != MVT::i64)
29873 return false;
29874 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29875 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29876 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
29877 unsigned EltSize = EltVT.getSizeInBits();
29878 // Make sure we can represent the same shift with the new VT.
29879 if ((ShiftAmt % EltSize) != 0)
29880 return false;
29881 Imm = ShiftAmt / EltSize;
29882 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29883 DAG.getConstant(Imm, DL, MVT::i8));
29884 }
29885 case X86ISD::SHUF128: {
29886 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
29887 return false;
29888 // Only change element size, not type.
29889 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29890 return false;
29891 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29892 Op.getOperand(2));
29893 }
29894 case ISD::INSERT_SUBVECTOR: {
29895 unsigned EltSize = EltVT.getSizeInBits();
29896 if (EltSize != 32 && EltSize != 64)
29897 return false;
29898 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29899 // Only change element size, not type.
29900 if (EltVT.isInteger() != OpEltVT.isInteger())
29901 return false;
29902 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29903 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29904 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
29905 DCI.AddToWorklist(Op0.getNode());
29906 // Op1 needs to be bitcasted to a smaller vector with the same element type.
29907 SDValue Op1 = Op.getOperand(1);
29908 MVT Op1VT = MVT::getVectorVT(EltVT,
29909 Op1.getSimpleValueType().getSizeInBits() / EltSize);
29910 Op1 = DAG.getBitcast(Op1VT, Op1);
29911 DCI.AddToWorklist(Op1.getNode());
29912 DCI.CombineTo(OrigOp.getNode(),
29913 DAG.getNode(Opcode, DL, VT, Op0, Op1,
29914 DAG.getIntPtrConstant(Imm, DL)));
29915 return true;
29916 }
29917 case ISD::EXTRACT_SUBVECTOR: {
29918 unsigned EltSize = EltVT.getSizeInBits();
29919 if (EltSize != 32 && EltSize != 64)
29920 return false;
29921 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29922 // Only change element size, not type.
29923 if (EltVT.isInteger() != OpEltVT.isInteger())
29924 return false;
29925 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
29926 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29927 // Op0 needs to be bitcasted to a larger vector with the same element type.
29928 SDValue Op0 = Op.getOperand(0);
29929 MVT Op0VT = MVT::getVectorVT(EltVT,
29930 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29931 Op0 = DAG.getBitcast(Op0VT, Op0);
29932 DCI.AddToWorklist(Op0.getNode());
29933 DCI.CombineTo(OrigOp.getNode(),
29934 DAG.getNode(Opcode, DL, VT, Op0,
29935 DAG.getIntPtrConstant(Imm, DL)));
29936 return true;
29937 }
29938 case X86ISD::SUBV_BROADCAST: {
29939 unsigned EltSize = EltVT.getSizeInBits();
29940 if (EltSize != 32 && EltSize != 64)
29941 return false;
29942 // Only change element size, not type.
29943 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29944 return false;
29945 SDValue Op0 = Op.getOperand(0);
29946 MVT Op0VT = MVT::getVectorVT(EltVT,
29947 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29948 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
29949 DCI.AddToWorklist(Op0.getNode());
29950 DCI.CombineTo(OrigOp.getNode(),
29951 DAG.getNode(Opcode, DL, VT, Op0));
29952 return true;
29953 }
29954 }
29956 return false;
29957 }
29959 /// Do target-specific dag combines on SELECT and VSELECT nodes.
29960 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
29961 TargetLowering::DAGCombinerInfo &DCI,
29962 const X86Subtarget &Subtarget) {
29963 SDLoc DL(N);
29964 SDValue Cond = N->getOperand(0);
29965 // Get the LHS/RHS of the select.
29966 SDValue LHS = N->getOperand(1);
29967 SDValue RHS = N->getOperand(2);
29968 EVT VT = LHS.getValueType();
29969 EVT CondVT = Cond.getValueType();
29970 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29972 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
29973 // instructions match the semantics of the common C idiom x<y?x:y but not
29974 // x<=y?x:y, because of how they handle negative zero (which can be
29975 // ignored in unsafe-math mode).
29976 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
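// E.g. (sketch) (select (setcc x, y, setolt), x, y) --> (FMIN x, y); the
// switch below rejects, or fixes by swapping operands, the cases where
// -0.0 or NaN behavior would differ.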
29977 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
29978 VT != MVT::f80 && VT != MVT::f128 &&
29979 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
29980 (Subtarget.hasSSE2() ||
29981 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
29982 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29984 unsigned Opcode = 0;
29985 // Check for x CC y ? x : y.
29986 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
29987 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
29988 switch (CC) {
29989 default: break;
29990 case ISD::SETULT:
29991 // Converting this to a min would handle NaNs incorrectly, and swapping
29992 // the operands would cause it to handle comparisons between positive
29993 // and negative zero incorrectly.
29994 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29995 if (!DAG.getTarget().Options.UnsafeFPMath &&
29996 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29997 break;
29998 std::swap(LHS, RHS);
29999 }
30000 Opcode = X86ISD::FMIN;
30001 break;
30002 case ISD::SETOLE:
30003 // Converting this to a min would handle comparisons between positive
30004 // and negative zero incorrectly.
30005 if (!DAG.getTarget().Options.UnsafeFPMath &&
30006 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30007 break;
30008 Opcode = X86ISD::FMIN;
30009 break;
30010 case ISD::SETULE:
30011 // Converting this to a min would handle both negative zeros and NaNs
30012 // incorrectly, but we can swap the operands to fix both.
30013 std::swap(LHS, RHS);
30014 LLVM_FALLTHROUGH;
30015 case ISD::SETOLT:
30016 case ISD::SETLT:
30017 case ISD::SETLE:
30018 Opcode = X86ISD::FMIN;
30019 break;
30021 case ISD::SETOGE:
30022 // Converting this to a max would handle comparisons between positive
30023 // and negative zero incorrectly.
30024 if (!DAG.getTarget().Options.UnsafeFPMath &&
30025 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30026 break;
30027 Opcode = X86ISD::FMAX;
30028 break;
30029 case ISD::SETUGT:
30030 // Converting this to a max would handle NaNs incorrectly, and swapping
30031 // the operands would cause it to handle comparisons between positive
30032 // and negative zero incorrectly.
30033 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30034 if (!DAG.getTarget().Options.UnsafeFPMath &&
30035 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30036 break;
30037 std::swap(LHS, RHS);
30038 }
30039 Opcode = X86ISD::FMAX;
30040 break;
30041 case ISD::SETUGE:
30042 // Converting this to a max would handle both negative zeros and NaNs
30043 // incorrectly, but we can swap the operands to fix both.
30044 std::swap(LHS, RHS);
30045 LLVM_FALLTHROUGH;
30046 case ISD::SETOGT:
30047 case ISD::SETGT:
30048 case ISD::SETGE:
30049 Opcode = X86ISD::FMAX;
30050 break;
30051 }
30052 // Check for x CC y ? y : x -- a min/max with reversed arms.
30053 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
30054 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
30055 switch (CC) {
30056 default: break;
30057 case ISD::SETOGE:
30058 // Converting this to a min would handle comparisons between positive
30059 // and negative zero incorrectly, and swapping the operands would
30060 // cause it to handle NaNs incorrectly.
30061 if (!DAG.getTarget().Options.UnsafeFPMath &&
30062 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
30063 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30064 break;
30065 std::swap(LHS, RHS);
30066 }
30067 Opcode = X86ISD::FMIN;
30068 break;
30069 case ISD::SETUGT:
30070 // Converting this to a min would handle NaNs incorrectly.
30071 if (!DAG.getTarget().Options.UnsafeFPMath &&
30072 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
30073 break;
30074 Opcode = X86ISD::FMIN;
30075 break;
30076 case ISD::SETUGE:
30077 // Converting this to a min would handle both negative zeros and NaNs
30078 // incorrectly, but we can swap the operands to fix both.
30079 std::swap(LHS, RHS);
30080 LLVM_FALLTHROUGH;
30081 case ISD::SETOGT:
30082 case ISD::SETGT:
30083 case ISD::SETGE:
30084 Opcode = X86ISD::FMIN;
30085 break;
30087 case ISD::SETULT:
30088 // Converting this to a max would handle NaNs incorrectly.
30089 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30090 break;
30091 Opcode = X86ISD::FMAX;
30092 break;
30093 case ISD::SETOLE:
30094 // Converting this to a max would handle comparisons between positive
30095 // and negative zero incorrectly, and swapping the operands would
30096 // cause it to handle NaNs incorrectly.
30097 if (!DAG.getTarget().Options.UnsafeFPMath &&
30098 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
30099 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30100 break;
30101 std::swap(LHS, RHS);
30102 }
30103 Opcode = X86ISD::FMAX;
30104 break;
30105 case ISD::SETULE:
30106 // Converting this to a max would handle both negative zeros and NaNs
30107 // incorrectly, but we can swap the operands to fix both.
30108 std::swap(LHS, RHS);
30109 LLVM_FALLTHROUGH;
30110 case ISD::SETOLT:
30111 case ISD::SETLT:
30112 case ISD::SETLE:
30113 Opcode = X86ISD::FMAX;
30114 break;
30115 }
30116 }
30118 if (Opcode)
30119 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
30120 }
30122 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30123 // lowering on KNL. In this case we convert it to
30124 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
30125 // The same situation for all 128 and 256-bit vectors of i8 and i16.
30126 // Since SKX, these selects have a proper lowering.
30127 if (Subtarget.hasAVX512() && CondVT.isVector() &&
30128 CondVT.getVectorElementType() == MVT::i1 &&
30129 (VT.is128BitVector() || VT.is256BitVector()) &&
30130 (VT.getVectorElementType() == MVT::i8 ||
30131 VT.getVectorElementType() == MVT::i16) &&
30132 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
30133 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
30134 DCI.AddToWorklist(Cond.getNode());
30135 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
30136 }
30138 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
30139 return V;
30141 // Canonicalize max and min:
30142 // (x > y) ? x : y -> (x >= y) ? x : y
30143 // (x < y) ? x : y -> (x <= y) ? x : y
30144 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
30145 // the need for an extra compare
30146 // against zero. e.g.
30147 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
30148 // subl %esi, %edi
30149 // testl %edi, %edi
30150 // movl $0, %eax
30151 // cmovgl %edi, %eax
30152 // =>
30153 // xorl %eax, %eax
30154 // subl %esi, %edi
30155 // cmovsl %eax, %edi
30156 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
30157 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30158 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30159 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30160 switch (CC) {
30161 default: break;
30162 case ISD::SETLT:
30163 case ISD::SETGT: {
30164 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
30165 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30166 Cond.getOperand(0), Cond.getOperand(1), NewCC);
30167 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
30168 }
30169 }
30170 }
30172 // Early exit check.
30173 if (!TLI.isTypeLegal(VT))
30174 return SDValue();
30176 // Match VSELECTs into subs with unsigned saturation.
30177 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
30178 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30179 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
30180 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
30181 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30183 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30184 // left side invert the predicate to simplify logic below.
30185 SDValue Other;
30186 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
30187 Other = RHS;
30188 CC = ISD::getSetCCInverse(CC, true);
30189 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
30190 Other = LHS;
30191 }
30193 if (Other.getNode() && Other->getNumOperands() == 2 &&
30194 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
30195 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30196 SDValue CondRHS = Cond->getOperand(1);
30198 // Look for a general sub with unsigned saturation first.
30199 // x >= y ? x-y : 0 --> subus x, y
30200 // x > y ? x-y : 0 --> subus x, y
30201 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
30202 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
30203 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30205 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
30206 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
30207 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30208 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
30209 // If the RHS is a constant we have to reverse the const
30210 // canonicalization.
30211 // x > C-1 ? x+-C : 0 --> subus x, C
30212 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
30213 CondRHSConst->getAPIntValue() ==
30214 (-OpRHSConst->getAPIntValue() - 1))
30215 return DAG.getNode(
30216 X86ISD::SUBUS, DL, VT, OpLHS,
30217 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30219 // Another special case: If C was a sign bit, the sub has been
30220 // canonicalized into a xor.
30221 // FIXME: Would it be better to use computeKnownBits to determine
30222 // whether it's safe to decanonicalize the xor?
30223 // x s< 0 ? x^C : 0 --> subus x, C
30224 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
30225 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30226 OpRHSConst->getAPIntValue().isSignMask())
30227 // Note that we have to rebuild the RHS constant here to ensure we
30228 // don't rely on particular values of undef lanes.
30229 return DAG.getNode(
30230 X86ISD::SUBUS, DL, VT, OpLHS,
30231 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30232 }
30233 }
30234 }
30236 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30237 return V;
30239 // If this is a *dynamic* select (non-constant condition) and we can match
30240 // this node with one of the variable blend instructions, restructure the
30241 // condition so that blends can use the high (sign) bit of each element and
30242 // use SimplifyDemandedBits to simplify the condition operand.
30243 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30244 !DCI.isBeforeLegalize() &&
30245 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30246 unsigned BitWidth = Cond.getScalarValueSizeInBits();
30248 // Don't optimize vector selects that map to mask registers.
30249 if (BitWidth == 1)
30250 return SDValue();
30252 // We can only handle the cases where VSELECT is directly legal on the
30253 // subtarget. We custom lower VSELECT nodes with constant conditions and
30254 // this makes it hard to see whether a dynamic VSELECT will correctly
30255 // lower, so we both check the operation's status and explicitly handle the
30256 // cases where a *dynamic* blend will fail even though a constant-condition
30257 // blend could be custom lowered.
30258 // FIXME: We should find a better way to handle this class of problems.
30259 // Potentially, we should combine constant-condition vselect nodes
30260 // pre-legalization into shuffles and not mark as many types as custom
30261 // lowered.
30262 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
30263 return SDValue();
30264 // FIXME: We don't support i16-element blends currently. We could and
30265 // should support them by making *all* the bits in the condition be set
30266 // rather than just the high bit and using an i8-element blend.
30267 if (VT.getVectorElementType() == MVT::i16)
30268 return SDValue();
30269 // Dynamic blending was only available from SSE4.1 onward.
30270 if (VT.is128BitVector() && !Subtarget.hasSSE41())
30271 return SDValue();
30272 // Byte blends are only available in AVX2.
30273 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
30274 return SDValue();
30276 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30277 APInt DemandedMask(APInt::getSignMask(BitWidth));
30278 KnownBits Known;
30279 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
30280 DCI.isBeforeLegalizeOps());
30281 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30282 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
30283 // If we changed the computation somewhere in the DAG, this change will
30284 // affect all users of Cond. Make sure it is fine and update all the nodes
30285 // so that we do not use the generic VSELECT anymore. Otherwise, we may
30286 // perform wrong optimizations as we messed with the actual expectation
30287 // for the vector boolean values.
30288 if (Cond != TLO.Old) {
30289 // Check all uses of the condition operand to check whether it will be
30290 // consumed by non-BLEND instructions. Those may require that all bits
30291 // are set properly.
30292 for (SDNode *U : Cond->uses()) {
30293 // TODO: Add other opcodes eventually lowered into BLEND.
30294 if (U->getOpcode() != ISD::VSELECT)
30295 return SDValue();
30296 }
30298 // Update all users of the condition before committing the change, so
30299 // that the VSELECT optimizations that expect the correct vector boolean
30300 // value will not be triggered.
30301 for (SDNode *U : Cond->uses()) {
30302 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30303 U->getValueType(0), Cond, U->getOperand(1),
30304 U->getOperand(2));
30305 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30306 }
30307 DCI.CommitTargetLoweringOpt(TLO);
30308 return SDValue();
30309 }
30310 // Only Cond (rather than other nodes in the computation chain) was
30311 // changed. Change the condition just for N to keep the opportunity to
30312 // optimize all other users their own way.
30313 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30314 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30315 return SDValue();
30316 }
30317 }
30319 // Look for vselects with LHS/RHS being bitcasted from an operation that
30320 // can be executed on another type. Push the bitcast to the inputs of
30321 // the operation. This exposes opportunities for using masking instructions.
30322 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30323 CondVT.getVectorElementType() == MVT::i1) {
30324 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30325 return SDValue(N, 0);
30326 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30327 return SDValue(N, 0);
30328 }
30330 return SDValue();
30331 }
30333 /// Combine:
30334 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
30335 /// to:
30336 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30337 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30338 /// Note that this is only legal for some op/cc combinations.
30339 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30340 SelectionDAG &DAG) {
30341 // This combine only operates on CMP-like nodes.
30342 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30343 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30344 return SDValue();
30346 // Can't replace the cmp if it has more uses than the one we're looking at.
30347 // FIXME: We would like to be able to handle this, but would need to make sure
30348 // all uses were updated.
30349 if (!Cmp.hasOneUse())
30350 return SDValue();
30352 // This only applies to variations of the common case:
30353 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30354 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30355 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30356 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
30357 // Using the proper condcodes (see below), overflow is checked for.
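// E.g. (sketch): (setcc (cmp (atomic_load_add x, 1), 0), COND_S) becomes
// (setcc (LADD x, 1), COND_LE): the sign test of x is re-expressed as a
// signed-less-or-equal test of x + 1 using the LOCK'ed add's own flags.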
30359 // FIXME: We can generalize both constraints:
30360 // - XOR/OR/AND (if they were made to survive AtomicExpand)
30361 // - LHS != 1
30362 // if the result is compared.
30364 SDValue CmpLHS = Cmp.getOperand(0);
30365 SDValue CmpRHS = Cmp.getOperand(1);
30367 if (!CmpLHS.hasOneUse())
30368 return SDValue();
30370 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
30371 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
30372 return SDValue();
30374 const unsigned Opc = CmpLHS.getOpcode();
30376 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
30377 return SDValue();
30379 SDValue OpRHS = CmpLHS.getOperand(2);
30380 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
30381 if (!OpRHSC)
30382 return SDValue();
30384 APInt Addend = OpRHSC->getAPIntValue();
30385 if (Opc == ISD::ATOMIC_LOAD_SUB)
30386 Addend = -Addend;
30388 if (CC == X86::COND_S && Addend == 1)
30389 CC = X86::COND_LE;
30390 else if (CC == X86::COND_NS && Addend == 1)
30391 CC = X86::COND_G;
30392 else if (CC == X86::COND_G && Addend == -1)
30393 CC = X86::COND_GE;
30394 else if (CC == X86::COND_LE && Addend == -1)
30395 CC = X86::COND_L;
30396 else
30397 return SDValue();
30399 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30400 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30401 DAG.getUNDEF(CmpLHS.getValueType()));
30402 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
30403 return LockOp;
30404 }
30406 // Check whether a boolean test is testing a boolean value generated by
30407 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
30408 // flag.
30410 // Simplify the following patterns:
30411 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30412 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30413 // to (Op EFLAGS Cond)
30415 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30416 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30417 // to (Op EFLAGS !Cond)
30419 // where Op could be BRCOND or CMOV.
30421 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30422 // This combine only operates on CMP-like nodes.
30423 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30424 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30425 return SDValue();
30427 // Quit if not used as a boolean value.
30428 if (CC != X86::COND_E && CC != X86::COND_NE)
30429 return SDValue();
30431 // Check CMP operands. One of them should be 0 or 1 and the other should be
30432 // an SetCC or extended from it.
30433 SDValue Op1 = Cmp.getOperand(0);
30434 SDValue Op2 = Cmp.getOperand(1);
30436 SDValue SetCC;
30437 const ConstantSDNode* C = nullptr;
30438 bool needOppositeCond = (CC == X86::COND_E);
30439 bool checkAgainstTrue = false; // Is it a comparison against 1?
30441 if ((C = dyn_cast<ConstantSDNode>(Op1)))
30442 SetCC = Op2;
30443 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
30444 SetCC = Op1;
30445 else // Quit if all operands are not constants.
30446 return SDValue();
30448 if (C->getZExtValue() == 1) {
30449 needOppositeCond = !needOppositeCond;
30450 checkAgainstTrue = true;
30451 } else if (C->getZExtValue() != 0)
30452 // Quit if the constant is neither 0 nor 1.
30453 return SDValue();
30455 bool truncatedToBoolWithAnd = false;
30456 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
30457 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30458 SetCC.getOpcode() == ISD::TRUNCATE ||
30459 SetCC.getOpcode() == ISD::AND) {
30460 if (SetCC.getOpcode() == ISD::AND) {
30461 int OpIdx = -1;
30462 if (isOneConstant(SetCC.getOperand(0)))
30463 OpIdx = 1;
30464 if (isOneConstant(SetCC.getOperand(1)))
30465 OpIdx = 0;
30466 if (OpIdx < 0)
30467 break;
30468 SetCC = SetCC.getOperand(OpIdx);
30469 truncatedToBoolWithAnd = true;
30470 } else
30471 SetCC = SetCC.getOperand(0);
30472 }
30474 switch (SetCC.getOpcode()) {
30475 case X86ISD::SETCC_CARRY:
30476 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30477 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30478 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30479 // truncated to i1 using 'and'.
30480 if (checkAgainstTrue && !truncatedToBoolWithAnd)
30481 break;
30482 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
30483 "Invalid use of SETCC_CARRY!");
30484 LLVM_FALLTHROUGH;
30485 case X86ISD::SETCC:
30486 // Set the condition code or opposite one if necessary.
30487 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30488 if (needOppositeCond)
30489 CC = X86::GetOppositeBranchCondition(CC);
30490 return SetCC.getOperand(1);
30491 case X86ISD::CMOV: {
30492 // Check whether false/true value has canonical one, i.e. 0 or 1.
30493 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30494 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
30495 // Quit if true value is not a constant.
30496 if (!TVal)
30497 return SDValue();
30498 // Quit if false value is not a constant.
30499 if (!FVal) {
30500 SDValue Op = SetCC.getOperand(0);
30501 // Skip 'zext' or 'trunc' node.
30502 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30503 Op.getOpcode() == ISD::TRUNCATE)
30504 Op = Op.getOperand(0);
30505 // A special case for rdrand/rdseed, where 0 is set if false cond is
30506 // found.
30507 if ((Op.getOpcode() != X86ISD::RDRAND &&
30508 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
30509 return SDValue();
30510 }
30511 // Quit if false value is not the constant 0 or 1.
30512 bool FValIsFalse = true;
30513 if (FVal && FVal->getZExtValue() != 0) {
30514 if (FVal->getZExtValue() != 1)
30515 return SDValue();
30516 // If FVal is 1, opposite cond is needed.
30517 needOppositeCond = !needOppositeCond;
30518 FValIsFalse = false;
30519 }
30520 // Quit if TVal is not the constant opposite of FVal.
30521 if (FValIsFalse && TVal->getZExtValue() != 1)
30522 return SDValue();
30523 if (!FValIsFalse && TVal->getZExtValue() != 0)
30524 return SDValue();
30525 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30526 if (needOppositeCond)
30527 CC = X86::GetOppositeBranchCondition(CC);
30528 return SetCC.getOperand(3);
30529 }
30530 }
30532 return SDValue();
30533 }
30535 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
30536 /// Match:
30537 /// (X86or (X86setcc) (X86setcc))
30538 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
30539 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
30540 X86::CondCode &CC1, SDValue &Flags,
30541 bool &isAnd) {
30542 if (Cond->getOpcode() == X86ISD::CMP) {
30543 if (!isNullConstant(Cond->getOperand(1)))
30544 return false;
30546 Cond = Cond->getOperand(0);
30547 }
30549 isAnd = false;
30551 SDValue SetCC0, SetCC1;
30552 switch (Cond->getOpcode()) {
30553 default: return false;
30554 case ISD::AND:
30555 case X86ISD::AND:
30556 isAnd = true;
30557 LLVM_FALLTHROUGH;
30558 case ISD::OR:
30559 case X86ISD::OR:
30560 SetCC0 = Cond->getOperand(0);
30561 SetCC1 = Cond->getOperand(1);
30562 break;
30563 }
30565 // Make sure we have SETCC nodes, using the same flags value.
30566 if (SetCC0.getOpcode() != X86ISD::SETCC ||
30567 SetCC1.getOpcode() != X86ISD::SETCC ||
30568 SetCC0->getOperand(1) != SetCC1->getOperand(1))
30569 return false;
30571 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
30572 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
30573 Flags = SetCC0->getOperand(1);
30574 return true;
30575 }
30577 /// Optimize an EFLAGS definition used according to the condition code \p CC
30578 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30579 /// uses of chain values.
30580 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30581 SelectionDAG &DAG) {
30582 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
30583 return R;
30584 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
30585 }
30587 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30588 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30589 TargetLowering::DAGCombinerInfo &DCI,
30590 const X86Subtarget &Subtarget) {
30591 SDLoc DL(N);
30593 // If the flag operand isn't dead, don't touch this CMOV.
30594 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
30595 return SDValue();
30597 SDValue FalseOp = N->getOperand(0);
30598 SDValue TrueOp = N->getOperand(1);
30599 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
30600 SDValue Cond = N->getOperand(3);
30602 if (CC == X86::COND_E || CC == X86::COND_NE) {
30603 switch (Cond.getOpcode()) {
30604 default: break;
30605 case X86ISD::BSR:
30606 case X86ISD::BSF:
30607 // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
30608 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
30609 return (CC == X86::COND_E) ? FalseOp : TrueOp;
30610 }
30611 }
30613 // Try to simplify the EFLAGS and condition code operands.
30614 // We can't always do this as FCMOV only supports a subset of X86 cond.
30615 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
30616 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
30617 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
30618 Flags};
30619 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30620 }
30621 }
30623 // If this is a select between two integer constants, try to do some
30624 // optimizations. Note that the operands are ordered the opposite of SELECT
30625 // operands.
30626 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
30627 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
30628 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
30629 // larger than FalseC (the false value).
30630 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
30631 CC = X86::GetOppositeBranchCondition(CC);
30632 std::swap(TrueC, FalseC);
30633 std::swap(TrueOp, FalseOp);
30634 }
30636 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
30637 // This is efficient for any integer data type (including i8/i16) and
30638 // shift amount.
30639 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30640 Cond = getSETCC(CC, Cond, DL, DAG);
30642 // Zero extend the condition if needed.
30643 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
30645 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30646 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
30647 DAG.getConstant(ShAmt, DL, MVT::i8));
30648 if (N->getNumValues() == 2) // Dead flag value?
30649 return DCI.CombineTo(N, Cond, SDValue());
30650 return Cond;
30651 }
30653 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
30654 // for any integer data type, including i8/i16.
30655 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
30656 Cond = getSETCC(CC, Cond, DL, DAG);
30658 // Zero extend the condition if needed.
30659 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
30660 FalseC->getValueType(0), Cond);
30661 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30662 SDValue(FalseC, 0));
30664 if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }
30669 // Optimize cases that will turn into an LEA instruction. This requires
30670 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30671 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
30672 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
30673 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
30675 bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
30679 case 1: // result = add base, cond
30680 case 2: // result = lea base( , cond*2)
30681 case 3: // result = lea base(cond, cond*2)
30682 case 4: // result = lea base( , cond*4)
30683 case 5: // result = lea base(cond, cond*4)
30684 case 8: // result = lea base( , cond*8)
30685 case 9: // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }
30691 if (isFastMultiplier) {
30692 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          Cond = getSETCC(CC, Cond, DL, DAG);
30694 // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
30697 // Scale the condition by the difference.
30699 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30700 DAG.getConstant(Diff, DL, Cond.getValueType()));
30702 // Add the base if non-zero.
30703 if (FalseC->getAPIntValue() != 0)
30704 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30705 SDValue(FalseC, 0));
30706 if (N->getNumValues() == 2) // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }
30714 // Handle these cases:
30715 // (select (x != c), e, c) -> select (x != c), e, x),
30716 // (select (x == c), c, e) -> select (x == c), x, e)
30717 // where the c is an integer constant, and the "select" is the combination
30718 // of CMOV and CMP.
30720 // The rationale for this change is that the conditional-move from a constant
30721 // needs two instructions, however, conditional-move from a register needs
30722 // only one instruction.
30724 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
30725 // some instruction-combining opportunities. This opt needs to be
30726 // postponed as late as possible.
30728 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // The DCI.xxxx conditions are provided to postpone the optimization as
30730 // late as possible.
30732 ConstantSDNode *CmpAgainst = nullptr;
30733 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
30734 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
30735 !isa<ConstantSDNode>(Cond.getOperand(0))) {
30737 if (CC == X86::COND_NE &&
30738 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
30739 CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueOp, FalseOp);
      }
30743 if (CC == X86::COND_E &&
30744 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
30745 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
30746 DAG.getConstant(CC, DL, MVT::i8), Cond };
        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
      }
    }
  }
30752 // Fold and/or of setcc's to double CMOV:
30753 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
30754 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
30756 // This combine lets us generate:
30757 // cmovcc1 (jcc1 if we don't have CMOV)
  //   cmovcc2 (same as above)
  // instead of:
  //   setcc1
  //   setcc2
  //   and/or
  //   cmovne (jne if we don't have CMOV)
  // When we can't use the CMOV instruction, it might increase branch
  // mispredicts.
30766 // When we can use CMOV, or when there is no mispredict, this improves
30767 // throughput and reduces register pressure.
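  // Illustrative sketch (not from the original source): for
  // 'x = (cc1 | cc2) ? T : F' this emits
  //   cmovcc1 T, F
  //   cmovcc2 T, tmp
  // on the same flags instead of two setccs plus an OR and a test.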
30769 if (CC == X86::COND_NE) {
    SDValue Flags;
    X86::CondCode CC0, CC1;
    bool isAndSetCC;
30773 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
      if (isAndSetCC) {
        std::swap(FalseOp, TrueOp);
        CC0 = X86::GetOppositeBranchCondition(CC0);
        CC1 = X86::GetOppositeBranchCondition(CC1);
      }
      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
                        Flags};
30782 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
30783 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
30784 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
      return CMOV;
    }
  }

  return SDValue();
}
30793 /// Different mul shrinking modes.
30794 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
30796 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
30797 EVT VT = N->getOperand(0).getValueType();
  if (VT.getScalarSizeInBits() != 32)
    return false;
30801 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
30802 unsigned SignBits[2] = {1, 1};
30803 bool IsPositive[2] = {false, false};
30804 for (unsigned i = 0; i < 2; i++) {
30805 SDValue Opd = N->getOperand(i);
    // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
30808 // compute signbits for it separately.
30809 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
      // For anyextend, it is safe to assume an appropriate number of leading
      // sign/zero bits.
      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
        SignBits[i] = 25;
      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
               MVT::i16)
        SignBits[i] = 17;
      else
        return false;
      IsPositive[i] = true;
30820 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
30821 // All the operands of BUILD_VECTOR need to be int constant.
30822 // Find the smallest value range which all the operands belong to.
30824 IsPositive[i] = true;
30825 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
        if (SubOp.isUndef())
          continue;
        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
        if (!CN)
          return false;
30831 APInt IntVal = CN->getAPIntValue();
30832 if (IntVal.isNegative())
30833 IsPositive[i] = false;
30834 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
      }
    } else {
      SignBits[i] = DAG.ComputeNumSignBits(Opd);
30838 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
        IsPositive[i] = true;
    }
  }
30843 bool AllPositive = IsPositive[0] && IsPositive[1];
30844 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
  // When ranges are from -128 ~ 127, use MULS8 mode.
  if (MinSignBits >= 25)
    Mode = MULS8;
  // When ranges are from 0 ~ 255, use MULU8 mode.
  else if (AllPositive && MinSignBits >= 24)
    Mode = MULU8;
  // When ranges are from -32768 ~ 32767, use MULS16 mode.
  else if (MinSignBits >= 17)
    Mode = MULS16;
  // When ranges are from 0 ~ 65535, use MULU16 mode.
  else if (AllPositive && MinSignBits >= 16)
    Mode = MULU16;
  else
    return false;
  return true;
}
30862 /// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
30864 /// efficient code. Two typical patterns are handled:
30866 /// %2 = sext/zext <N x i8> %1 to <N x i32>
30867 /// %4 = sext/zext <N x i8> %3 to <N x i32>
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30869 /// %5 = mul <N x i32> %2, %4
30872 /// %2 = zext/sext <N x i16> %1 to <N x i32>
30873 /// %4 = zext/sext <N x i16> %3 to <N x i32>
30874 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30875 /// %5 = mul <N x i32> %2, %4
30877 /// There are four mul shrinking modes:
30878 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
30880 /// generate pmullw+sext32 for it (MULS8 mode).
30881 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
30882 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
30883 /// generate pmullw+zext32 for it (MULU8 mode).
30884 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
30885 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
30886 /// generate pmullw+pmulhw for it (MULS16 mode).
30887 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
30888 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
30889 /// generate pmullw+pmulhuw for it (MULU16 mode).
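/// Illustrative example (assumed lowering, not from the original source):
/// with SSE2 but without SSE4.1,
///   %5 = mul <8 x i32> (zext <8 x i16> %a), (zext <8 x i16> %b)
/// can be emitted as pmullw + pmulhuw on the i16 operands and repacked with
/// punpcklwd/punpckhwd (MULU16 mode), avoiding scalarized 32-bit multiplies.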
30890 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
30891 const X86Subtarget &Subtarget) {
30892 // Check for legality
30893 // pmullw/pmulhw are not supported by SSE.
30894 if (!Subtarget.hasSSE2())
30897 // Check for profitability
30898 // pmulld is supported since SSE41. It is better to use pmulld
  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
  // pmullw+pmulhw.
30901 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
30902 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
30906 if (!canReduceVMulWidth(N, DAG, Mode))
30910 SDValue N0 = N->getOperand(0);
30911 SDValue N1 = N->getOperand(1);
30912 EVT VT = N->getOperand(0).getValueType();
30913 unsigned RegSize = 128;
30914 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
  EVT ReducedVT =
      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
30917 // Shrink the operands of mul.
30918 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
30919 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
30921 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
30922 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
30923 // lower part is needed.
30924 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
30925 if (Mode == MULU8 || Mode == MULS8) {
      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
                         DL, VT, MulLo);
    }
30929 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30930 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
30931 // the higher part is also needed.
30932 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30933 ReducedVT, NewN0, NewN1);
    // Repack the lower part and higher part result of mul into a wider
    // result.
30937 // Generate shuffle functioning as punpcklwd.
30938 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
30939 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30940 ShuffleMask[2 * i] = i;
30941 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
    }
    SDValue ResLo =
        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30945 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
30946 // Generate shuffle functioning as punpckhwd.
30947 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30948 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
30949 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
    }
    SDValue ResHi =
        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30953 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
30954 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
30957 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
30958 // to legalize the mul explicitly because implicit legalization for type
30959 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
30960 // instructions which will not exist when we explicitly legalize it by
30961 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
30962 // <4 x i16> undef).
30964 // Legalize the operands of mul.
30965 // FIXME: We may be able to handle non-concatenated vectors by insertion.
30966 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
30967 if ((RegSize % ReducedSizeInBits) != 0)
30970 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
30971 DAG.getUNDEF(ReducedVT));
  Ops[0] = NewN0;
  NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
  Ops[0] = NewN1;
  NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30977 if (Mode == MULU8 || Mode == MULS8) {
    // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
    // part is needed.
30980 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
    // Convert the type of mul result to VT.
30983 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30984 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
                                            : ISD::SIGN_EXTEND_VECTOR_INREG,
                              DL, ResVT, Mul);
30987 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30988 DAG.getIntPtrConstant(0, DL));
30990 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
30991 // MULU16/MULS16, both parts are needed.
30992 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30993 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30994 OpsVT, NewN0, NewN1);
30996 // Repack the lower part and higher part result of mul into a wider
30997 // result. Make sure the type of mul result is VT.
30998 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30999 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
31000 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
31001 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31002 DAG.getIntPtrConstant(0, DL));
31007 /// Optimize a single multiply with constant into two operations in order to
31008 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
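/// E.g. (illustrative, assumed codegen): 'x * 40' becomes x*5 via LEA, then
/// a shift by 3:
///   leaq (%rdi,%rdi,4), %rax
///   shlq $3, %rax
/// instead of a single, higher-latency imul.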
31009 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
31010 TargetLowering::DAGCombinerInfo &DCI,
31011 const X86Subtarget &Subtarget) {
31012 EVT VT = N->getValueType(0);
31013 if (DCI.isBeforeLegalize() && VT.isVector())
31014 return reduceVMULWidth(N, DAG, Subtarget);
31016 // An imul is usually smaller than the alternative sequence.
31017 if (DAG.getMachineFunction().getFunction()->optForMinSize())
31020 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
31023 if (VT != MVT::i64 && VT != MVT::i32)
31026 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
31029 uint64_t MulAmt = C->getZExtValue();
31030 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
31033 uint64_t MulAmt1 = 0;
31034 uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  SDValue NewMul;
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
31051 if (isPowerOf2_64(MulAmt2) &&
31052 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If second multiplier is pow2, issue it first. We want the multiply by
31054 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
31056 std::swap(MulAmt1, MulAmt2);
31058 if (isPowerOf2_64(MulAmt1))
31059 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31060 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
31062 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31063 DAG.getConstant(MulAmt1, DL, VT));
31065 if (isPowerOf2_64(MulAmt2))
31066 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
31067 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
31069 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31070 DAG.getConstant(MulAmt2, DL, VT));
  }

  if (!NewMul) {
    assert(MulAmt != 0 &&
31075 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31076 "Both cases that could cause potential overflows should have "
31077 "already been handled.");
31078 int64_t SignMulAmt = C->getSExtValue();
31079 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31080 (SignMulAmt != -INT64_MAX)) {
31081 int NumSign = SignMulAmt > 0 ? 1 : -1;
31082 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31083 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31084 if (IsPowerOf2_64PlusOne) {
31085 // (mul x, 2^N + 1) => (add (shl x, N), x)
31086 NewMul = DAG.getNode(
31087 ISD::ADD, DL, VT, N->getOperand(0),
31088 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
                                      MVT::i8)));
31091 } else if (IsPowerOf2_64MinusOne) {
31092 // (mul x, 2^N - 1) => (sub (shl x, N), x)
      NewMul = DAG.getNode(
          ISD::SUB, DL, VT,
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
                                      MVT::i8)),
          N->getOperand(0));
    }
      // To negate, subtract the number from zero.
      if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
        NewMul =
            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
    }
  }
  if (NewMul)
    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);

  return SDValue();
}
31114 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31115 SDValue N0 = N->getOperand(0);
31116 SDValue N1 = N->getOperand(1);
31117 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31118 EVT VT = N0.getValueType();
31120 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31121 // since the result of setcc_c is all zero's or all ones.
31122 if (VT.isInteger() && !VT.isVector() &&
31123 N1C && N0.getOpcode() == ISD::AND &&
31124 N0.getOperand(1).getOpcode() == ISD::Constant) {
31125 SDValue N00 = N0.getOperand(0);
31126 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31127 Mask <<= N1C->getAPIntValue();
31128 bool MaskOK = false;
31129 // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
31132 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31133 // of the underlying setcc_c operation if the setcc_c was zero extended.
31134 // Consider the following example:
31135 // zext(setcc_c) -> i32 0x0000FFFF
31136 // c1 -> i32 0x0000FFFF
31137 // c2 -> i32 0x00000001
31138 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31139 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
31142 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
31145 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31146 N00.getOpcode() == ISD::ANY_EXTEND) &&
31147 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    }
31150 if (MaskOK && Mask != 0) {
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
    }
  }
31156 // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on sandybridge ADD is faster than
  // SHL.
31159 // (shl V, 1) -> add V,V
31160 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31161 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31162 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31163 // We shift all of the values by one. In many cases we do not have
      // hardware support for this operation. This is better expressed as an ADD
      // of two values.
31166 if (N1SplatC->getAPIntValue() == 1)
31167 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31173 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31174 SDValue N0 = N->getOperand(0);
31175 SDValue N1 = N->getOperand(1);
31176 EVT VT = N0.getValueType();
31177 unsigned Size = VT.getSizeInBits();
31179 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31180 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31181 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31182 // depending on sign of (SarConst - [56,48,32,24,16])
31184 // sexts in X86 are MOVs. The MOVs have the same code size
31185 // as above SHIFTs (only SHIFT on 1 has lower code size).
31186 // However the MOVs have 2 advantages to a SHIFT:
31187 // 1. MOVs can write to a register that differs from source
31188 // 2. MOVs accept memory operands
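  // E.g. (illustrative, assumed codegen): (sar (shl x, 56), 58) on i64 can
  // become
  //   movsbq %dil, %rax    ; sign_extend_inreg from i8
  //   sarq   $2, %rax      ; remaining shift amount 58 - 56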
31190 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31191 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31192 N0.getOperand(1).getOpcode() != ISD::Constant)
31195 SDValue N00 = N0.getOperand(0);
31196 SDValue N01 = N0.getOperand(1);
31197 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31198 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31199 EVT CVT = N1.getValueType();
  if (SarConst.isNegative())
    return SDValue();
31204 for (MVT SVT : MVT::integer_valuetypes()) {
31205 unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext and ShlConst values that
    // are not one of [56,48,32,24,16].
    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    else if (SarConst.isNegative())
31217 return DAG.getNode(ISD::SHL, DL, VT, NN,
31218 DAG.getConstant(-SarConst, DL, CVT));
    else
      return DAG.getNode(ISD::SRA, DL, VT, NN,
                         DAG.getConstant(SarConst, DL, CVT));
  }

  return SDValue();
}
31226 /// \brief Returns a vector of 0s if the node in input is a vector logical
31227 /// shift by a constant amount which is known to be bigger than or equal
31228 /// to the vector element size in bits.
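/// E.g. (illustrative): (srl <4 x i32> %x, <32, 32, 32, 32>) folds to a zero
/// vector, because SSE2/AVX2 psrld with an immediate shift amount >= 32
/// always produces zeroes.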
31229 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31230 const X86Subtarget &Subtarget) {
31231 EVT VT = N->getValueType(0);
31233 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31234 (!Subtarget.hasInt256() ||
      (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
    return SDValue();
  SDValue Amt = N->getOperand(1);
  SDLoc DL(N);
31240 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31241 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31242 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31243 unsigned MaxAmount =
31244 VT.getSimpleVT().getScalarSizeInBits();
31246 // SSE2/AVX2 logical shifts always return a vector of 0s
31247 // if the shift amount is bigger than or equal to
31248 // the element size. The constant shift amount will be
      // encoded as an 8-bit immediate.
31250 if (ShiftAmt.trunc(8).uge(MaxAmount))
        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
    }

  return SDValue();
}
31257 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31258 TargetLowering::DAGCombinerInfo &DCI,
31259 const X86Subtarget &Subtarget) {
31260 if (N->getOpcode() == ISD::SHL)
    if (SDValue V = combineShiftLeft(N, DAG))
      return V;
31264 if (N->getOpcode() == ISD::SRA)
    if (SDValue V = combineShiftRightAlgebraic(N, DAG))
      return V;
31268 // Try to fold this logical shift into a zero vector.
31269 if (N->getOpcode() != ISD::SRA)
    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
      return V;

  return SDValue();
}
31276 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31277 TargetLowering::DAGCombinerInfo &DCI,
31278 const X86Subtarget &Subtarget) {
31279 unsigned Opcode = N->getOpcode();
31280 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31281 X86ISD::VSRLI == Opcode) &&
31282 "Unexpected shift opcode");
31283 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31284 EVT VT = N->getValueType(0);
31285 SDValue N0 = N->getOperand(0);
31286 SDValue N1 = N->getOperand(1);
31287 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31288 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31289 "Unexpected value type");
31291 // Out of range logical bit shifts are guaranteed to be zero.
31292 // Out of range arithmetic bit shifts splat the sign bit.
31293 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31294 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
    if (LogicalShift)
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
    else
      ShiftVal = NumBitsPerElt - 1;
  }
31301 // Shift N0 by zero -> N0.
31305 // Shift zero -> zero.
31306 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31307 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31309 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31310 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31311 // TODO - support other sra opcodes as needed.
31312 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31313 N0.getOpcode() == X86ISD::VSRAI)
31314 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31316 // We can decode 'whole byte' logical bit shifts as shuffles.
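  // E.g. (illustrative): a VSRLI of each v2i64 lane by 16 bits moves whole
  // bytes, so it behaves like a byte shuffle that pulls two zero bytes into
  // the top of each lane and can be handled by the shuffle combiner below.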
31317 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
31320 NonceMask.push_back(0);
31321 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }
31327 // Constant Folding.
  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
31330 if (N->isOnlyUserOf(N0.getNode()) &&
31331 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31332 assert(EltBits.size() == VT.getVectorNumElements() &&
31333 "Unexpected shift value type");
31334 unsigned ShiftImm = ShiftVal.getZExtValue();
31335 for (APInt &Elt : EltBits) {
      if (X86ISD::VSHLI == Opcode)
        Elt <<= ShiftImm;
      else if (X86ISD::VSRAI == Opcode)
        Elt.ashrInPlace(ShiftImm);
      else
        Elt.lshrInPlace(ShiftImm);
    }
31343 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31349 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31350 TargetLowering::DAGCombinerInfo &DCI,
31351 const X86Subtarget &Subtarget) {
  assert(((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31354 (N->getOpcode() == X86ISD::PINSRW &&
31355 N->getValueType(0) == MVT::v8i16)) &&
31356 "Unexpected vector insertion");
31358 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
  SDValue Op(N, 0);
  SmallVector<int, 1> NonceMask; // Just a placeholder.
31361 NonceMask.push_back(0);
31362 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                DCI, Subtarget);
  return SDValue();
}
31368 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31369 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31370 /// OR -> CMPNEQSS.
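/// E.g. (illustrative): 'and (seteq (cmp %a, %b)), (setnp (cmp %a, %b))' on
/// f32 operands can become a single 'cmpeqss', which produces an all-ones or
/// all-zeroes mask, followed by masking out bit 0.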
31371 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31372 TargetLowering::DAGCombinerInfo &DCI,
31373 const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31377 // we're requiring SSE2 for both.
31378 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31379 SDValue N0 = N->getOperand(0);
31380 SDValue N1 = N->getOperand(1);
31381 SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);
31385 // The SETCCs should both refer to the same CMP.
31386 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31389 SDValue CMP00 = CMP0->getOperand(0);
31390 SDValue CMP01 = CMP0->getOperand(1);
31391 EVT VT = CMP00.getValueType();
31393 if (VT == MVT::f32 || VT == MVT::f64) {
31394 bool ExpectingFlags = false;
31395 // Check for any users that want flags:
31396 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31397 !ExpectingFlags && UI != UE; ++UI)
31398 switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
31405 case ISD::CopyToReg:
31406 case ISD::SIGN_EXTEND:
31407 case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }
31412 if (!ExpectingFlags) {
31413 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31414 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31416 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }
31422 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31423 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31424 // FIXME: need symbolic constants for these magic numbers.
31425 // See X86ATTInstPrinter.cpp:printSSECC().
31426 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31427 if (Subtarget.hasAVX512()) {
            SDValue FSetCC =
                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
31430 DAG.getConstant(x86cc, DL, MVT::i8));
31431 return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
31432 FSetCC, DAG.getIntPtrConstant(0, DL));
31434 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31435 CMP00.getValueType(), CMP00, CMP01,
                                              DAG.getConstant(x86cc, DL,
                                                              MVT::i8));
31439 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31440 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31442 if (is64BitFP && !Subtarget.is64Bit()) {
31443 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31444 // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
31446 // bits, but can do this little dance to extract the lowest 32 bits
31447 // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                           OnesOrZeroesF);
31450 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31451 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31452 Vector32, DAG.getIntPtrConstant(0, DL));
31456 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31457 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31458 DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
31469 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31470 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31471 assert(N->getOpcode() == ISD::AND);
31473 EVT VT = N->getValueType(0);
31474 SDValue N0 = N->getOperand(0);
31475 SDValue N1 = N->getOperand(1);
31478 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
31481 if (N0.getOpcode() == ISD::XOR &&
31482 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31483 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31485 if (N1.getOpcode() == ISD::XOR &&
31486 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31487 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31492 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31493 // register. In most cases we actually compare or select YMM-sized registers
31494 // and mixing the two types creates horrible code. This method optimizes
31495 // some of the transition sequences.
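// E.g. (sketch, not from the original source):
//   sext(v8i16 (and (trunc v8i32 %a), (trunc v8i32 %b))) to v8i32
// can instead AND the wide %a and %b directly and sign-extend in-register,
// staying in YMM-sized operations throughout.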
31496 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31497 TargetLowering::DAGCombinerInfo &DCI,
31498 const X86Subtarget &Subtarget) {
31499 EVT VT = N->getValueType(0);
31500 if (!VT.is256BitVector())
31503 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31504 N->getOpcode() == ISD::ZERO_EXTEND ||
31505 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31507 SDValue Narrow = N->getOperand(0);
31508 EVT NarrowVT = Narrow->getValueType(0);
31509 if (!NarrowVT.is128BitVector())
31512 if (Narrow->getOpcode() != ISD::XOR &&
31513 Narrow->getOpcode() != ISD::AND &&
31514 Narrow->getOpcode() != ISD::OR)
31517 SDValue N0 = Narrow->getOperand(0);
31518 SDValue N1 = Narrow->getOperand(1);
31521 // The Left side has to be a trunc.
31522 if (N0.getOpcode() != ISD::TRUNCATE)
31525 // The type of the truncated inputs.
31526 EVT WideVT = N0->getOperand(0)->getValueType(0);
31530 // The right side has to be a 'trunc' or a constant vector.
31531 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31532 ConstantSDNode *RHSConstSplat = nullptr;
31533 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31534 RHSConstSplat = RHSBV->getConstantSplatNode();
31535 if (!RHSTrunc && !RHSConstSplat)
31538 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31540 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
31543 // Set N0 and N1 to hold the inputs to the new wide operation.
31544 N0 = N0->getOperand(0);
31545 if (RHSConstSplat) {
31546 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31547 SDValue(RHSConstSplat, 0));
31548 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31549 } else if (RHSTrunc) {
31550 N1 = N1->getOperand(0);
31553 // Generate the wide operation.
31554 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
31555 unsigned Opcode = N->getOpcode();
31557 case ISD::ANY_EXTEND:
31559 case ISD::ZERO_EXTEND: {
31560 unsigned InBits = NarrowVT.getScalarSizeInBits();
31561 APInt Mask = APInt::getAllOnesValue(InBits);
31562 Mask = Mask.zext(VT.getScalarSizeInBits());
31563 return DAG.getNode(ISD::AND, DL, VT,
31564 Op, DAG.getConstant(Mask, DL, VT));
31566 case ISD::SIGN_EXTEND:
31567 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
31568 Op, DAG.getValueType(NarrowVT));
31570 llvm_unreachable("Unexpected opcode");
31574 /// If both input operands of a logic op are being cast from floating point
31575 /// types, try to convert this into a floating point logic node to avoid
31576 /// unnecessary moves from SSE to integer registers.
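/// E.g. (illustrative): 'xor (bitcast f32 %a to i32), (bitcast f32 %b to i32)'
/// becomes 'bitcast (FXOR %a, %b) to i32', keeping both values in SSE
/// registers instead of bouncing them through GPRs.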
31577 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
31578 const X86Subtarget &Subtarget) {
31579 unsigned FPOpcode = ISD::DELETED_NODE;
31580 if (N->getOpcode() == ISD::AND)
31581 FPOpcode = X86ISD::FAND;
31582 else if (N->getOpcode() == ISD::OR)
31583 FPOpcode = X86ISD::FOR;
31584 else if (N->getOpcode() == ISD::XOR)
31585 FPOpcode = X86ISD::FXOR;
31587 assert(FPOpcode != ISD::DELETED_NODE &&
31588 "Unexpected input node for FP logic conversion");
31590 EVT VT = N->getValueType(0);
31591 SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
31594 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
31595 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
31596 (Subtarget.hasSSE2() && VT == MVT::i64))) {
31597 SDValue N00 = N0.getOperand(0);
31598 SDValue N10 = N1.getOperand(0);
31599 EVT N00Type = N00.getValueType();
31600 EVT N10Type = N10.getValueType();
31601 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
31602 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
31603 return DAG.getBitcast(VT, FPLogic);
31609 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
31610 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
31611 /// with a shift-right to eliminate loading the vector constant mask value.
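/// E.g. (sketch): 'and <4 x i32> %cmp, <1, 1, 1, 1>', where every lane of
/// %cmp is known to be all-ones or zero, can become 'psrld $31, %cmp',
/// avoiding a constant-pool load for the mask.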
31612 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
31613 const X86Subtarget &Subtarget) {
31614 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
31615 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
31616 EVT VT0 = Op0.getValueType();
31617 EVT VT1 = Op1.getValueType();
  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
    return SDValue();

  APInt SplatVal;
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
      !SplatVal.isMask())
    return SDValue();
31627 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
31630 unsigned EltBitWidth = VT0.getScalarSizeInBits();
  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
    return SDValue();

  SDLoc DL(N);
31635 unsigned ShiftVal = SplatVal.countTrailingOnes();
31636 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
31637 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
31638 return DAG.getBitcast(N->getValueType(0), Shift);
31641 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
31642 TargetLowering::DAGCombinerInfo &DCI,
31643 const X86Subtarget &Subtarget) {
31644 if (DCI.isBeforeLegalizeOps())
31647 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31650 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31653 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
31656 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
31659 EVT VT = N->getValueType(0);
31660 SDValue N0 = N->getOperand(0);
31661 SDValue N1 = N->getOperand(1);
31664 // Attempt to recursively combine a bitmask AND with shuffles.
31665 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
31668 NonceMask.push_back(0);
31669 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }
31675 // Create BEXTR instructions
31676 // BEXTR is ((X >> imm) & (2**size-1))
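  // E.g. (illustrative): '(x >> 4) & 0xFFF' with BMI can become a single
  // 'bextr' whose control value 0x0C04 encodes length 12 (bits 15:8) and
  // start 4 (bits 7:0), matching the 'Shift | (MaskSize << 8)' below.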
31677 if (VT != MVT::i32 && VT != MVT::i64)
31680 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
31682 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
31685 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
31686 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
31687 if (MaskNode && ShiftNode) {
31688 uint64_t Mask = MaskNode->getZExtValue();
31689 uint64_t Shift = ShiftNode->getZExtValue();
31690 if (isMask_64(Mask)) {
31691 uint64_t MaskSize = countPopulation(Mask);
31692 if (Shift + MaskSize <= VT.getSizeInBits())
31693 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
                         DAG.getConstant(Shift | (MaskSize << 8), DL,
                                         MVT::i8));
    }
  }
  return SDValue();
}
// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
// As a special case, try to fold:
//   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
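// E.g. (sketch): with an all-ones/all-zeroes lane mask m, the and/pandn/or
// trio picks the AND operand in lanes where m is set and the PANDN operand
// elsewhere, so on SSE4.1 it can be emitted as one pblendvb with m as the
// selector.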
31709 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
31710 const X86Subtarget &Subtarget) {
31711 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
31713 SDValue N0 = N->getOperand(0);
31714 SDValue N1 = N->getOperand(1);
31715 EVT VT = N->getValueType(0);
31717 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
31718 (VT.is256BitVector() && Subtarget.hasInt256())))
31721 // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);
31725 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
31726 // ANDNP combine allows other combines to happen that prevent matching.
  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
    return SDValue();
31730 SDValue Mask = N1.getOperand(0);
  SDValue X = N1.getOperand(1);
  SDValue Y;
31733 if (N0.getOperand(0) == Mask)
31734 Y = N0.getOperand(1);
31735 if (N0.getOperand(1) == Mask)
31736 Y = N0.getOperand(0);
  // Check to see if the mask appeared in both the AND and ANDNP.
  if (!Y.getNode())
    return SDValue();
31742 // Validate that X, Y, and Mask are bitcasts, and see through them.
31743 Mask = peekThroughBitcasts(Mask);
31744 X = peekThroughBitcasts(X);
31745 Y = peekThroughBitcasts(Y);
31747 EVT MaskVT = Mask.getValueType();
31748 unsigned EltBits = MaskVT.getScalarSizeInBits();
31750 // TODO: Attempt to handle floating point cases as well?
  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
    return SDValue();

  SDLoc DL(N);

  // Try to match:
  //   (or (and (M, (sub 0, X)), (pandn M, X)))
  // which is a special case of vselect:
  //   (vselect M, (sub 0, X), X)
  // Per:
  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
31762 // We know that, if fNegate is 0 or 1:
31763 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
31765 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
31766 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
31767 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
31768 // This lets us transform our vselect to:
31769 // (add (xor X, M), (and M, 1))
31771 // (sub (xor X, M), M)
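  // Quick numeric check (illustrative): X = 5 with M = -1 (negate) gives
  // (5 ^ -1) - (-1) = -6 + 1 = -5; with M = 0 it gives (5 ^ 0) - 0 = 5.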
31772 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
31773 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
31774 auto IsNegV = [](SDNode *N, SDValue V) {
31775 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
    };

    SDValue V;
    if (IsNegV(Y.getNode(), X))
      V = X;
    else if (IsNegV(X.getNode(), Y))
      V = Y;

    if (V) {
31785 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
31786 SDValue SubOp2 = Mask;
31788 // If the negate was on the false side of the select, then
31789 // the operands of the SUB need to be swapped. PR 27251.
31790 // This is because the pattern being matched above is
31791 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
31792 // but if the pattern matched was
31793 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
31794 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
31795 // pattern also needs to be a negation of the replacement pattern above.
31796 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
31797 // sub accomplishes the negation of the replacement pattern.
      if (V == Y)
        std::swap(SubOp1, SubOp2);
31801 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
31802 return DAG.getBitcast(VT, Res);
31806 // PBLENDVB is only available on SSE 4.1.
31807 if (!Subtarget.hasSSE41())
31810 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
31812 X = DAG.getBitcast(BlendVT, X);
31813 Y = DAG.getBitcast(BlendVT, Y);
31814 Mask = DAG.getBitcast(BlendVT, Mask);
31815 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
31816 return DAG.getBitcast(VT, Mask);
31819 // Helper function for combineOrCmpEqZeroToCtlzSrl
// Transform:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
31824 // Input pattern is checked by caller.
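// E.g. (illustrative): for i32 %x, 'seteq (cmp %x, 0)' becomes 'lzcnt %x'
// followed by 'shr $5': lzcnt yields 32 only when %x == 0, so bit 5 of the
// count is exactly the zero test.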
31825 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
31826 SelectionDAG &DAG) {
31827 SDValue Cmp = Op.getOperand(1);
31828 EVT VT = Cmp.getOperand(0).getValueType();
31829 unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
31832 // The result of the shift is true or false, and on X86, the 32-bit
31833 // encoding of shr and lzcnt is more desirable.
31834 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
31835 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
31836 DAG.getConstant(Log2b, dl, VT));
31837 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
31840 // Try to transform:
31841 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
31843 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
31844 // Will also attempt to match more generic cases, eg:
31845 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
31846 // Only applies if the target supports the FastLZCNT feature.
31847 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
31848 TargetLowering::DAGCombinerInfo &DCI,
31849 const X86Subtarget &Subtarget) {
31850 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
31853 auto isORCandidate = [](SDValue N) {
31854 return (N->getOpcode() == ISD::OR && N->hasOneUse());
31857 // Check the zero extend is extending to 32-bit or more. The code generated by
31858 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
31859 // instructions to clear the upper bits.
31860 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
31861 !isORCandidate(N->getOperand(0)))
31864 // Check the node matches: setcc(eq, cmp 0)
31865 auto isSetCCCandidate = [](SDValue N) {
31866 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
31867 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
31868 N->getOperand(1).getOpcode() == X86ISD::CMP &&
31869 isNullConstant(N->getOperand(1).getOperand(1)) &&
31870 N->getOperand(1).getValueType().bitsGE(MVT::i32);
31873 SDNode *OR = N->getOperand(0).getNode();
31874 SDValue LHS = OR->getOperand(0);
31875 SDValue RHS = OR->getOperand(1);
31877 // Save nodes matching or(or, setcc(eq, cmp 0)).
31878 SmallVector<SDNode *, 2> ORNodes;
31879 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
31880 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
31881 ORNodes.push_back(OR);
31882 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
31883 LHS = OR->getOperand(0);
31884 RHS = OR->getOperand(1);
31887 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
31888 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
31889 !isORCandidate(SDValue(OR, 0)))
31892 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
31894 // or(srl(ctlz),srl(ctlz)).
31895 // The dag combiner can then fold it into:
31896 // srl(or(ctlz, ctlz)).
31897 EVT VT = OR->getValueType(0);
31898 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
31899 SDValue Ret, NewRHS;
31900 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

  if (!Ret)
    return SDValue();
31906 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
31907 while (ORNodes.size() > 0) {
31908 OR = ORNodes.pop_back_val();
31909 LHS = OR->getOperand(0);
31910 RHS = OR->getOperand(1);
31911 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
31912 if (RHS->getOpcode() == ISD::OR)
31913 std::swap(LHS, RHS);
31914 EVT VT = OR->getValueType(0);
31915 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
    if (!NewRHS)
      return SDValue();
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
  }

  if (Ret)
    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

  return Ret;
}
31927 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
31928 TargetLowering::DAGCombinerInfo &DCI,
31929 const X86Subtarget &Subtarget) {
31930 if (DCI.isBeforeLegalizeOps())
31933 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31936 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31939 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
31942 SDValue N0 = N->getOperand(0);
31943 SDValue N1 = N->getOperand(1);
31944 EVT VT = N->getValueType(0);
31946 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
31949 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
31950 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
31952 // SHLD/SHRD instructions have lower register pressure, but on some
31953 // platforms they have higher latency than the equivalent
31954 // series of shifts/or that would otherwise be generated.
31955 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
31956 // have higher latencies and we are not optimizing for size.
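  // E.g. (illustrative): on i32, '(x << 10) | (y >> 22)' can fold to
  // 'shld $10, %y, %x' when the subtarget's SHLD is not slow.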
31957 if (!OptForSize && Subtarget.isSHLDSlow())
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
31962 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
31964 if (!N0.hasOneUse() || !N1.hasOneUse())
31967 SDValue ShAmt0 = N0.getOperand(1);
31968 if (ShAmt0.getValueType() != MVT::i8)
31970 SDValue ShAmt1 = N1.getOperand(1);
31971 if (ShAmt1.getValueType() != MVT::i8)
31973 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
31974 ShAmt0 = ShAmt0.getOperand(0);
31975 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
31976 ShAmt1 = ShAmt1.getOperand(0);
31979 unsigned Opc = X86ISD::SHLD;
31980 SDValue Op0 = N0.getOperand(0);
31981 SDValue Op1 = N1.getOperand(0);
31982 if (ShAmt0.getOpcode() == ISD::SUB ||
31983 ShAmt0.getOpcode() == ISD::XOR) {
31984 Opc = X86ISD::SHRD;
31985 std::swap(Op0, Op1);
31986 std::swap(ShAmt0, ShAmt1);
31989 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
31990 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
31991 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
31992 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
31993 unsigned Bits = VT.getSizeInBits();
31994 if (ShAmt1.getOpcode() == ISD::SUB) {
31995 SDValue Sum = ShAmt1.getOperand(0);
31996 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
31997 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
31998 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
31999 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
32000 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
32006 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
32007 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
32008 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
32009 return DAG.getNode(Opc, DL, VT,
32010 N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
32013 } else if (ShAmt1.getOpcode() == ISD::XOR) {
32014 SDValue Mask = ShAmt1.getOperand(1);
32015 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
32016 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
32017 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
32018 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
32019 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
32020 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
32021 if (Op1.getOpcode() == InnerShift &&
32022 isa<ConstantSDNode>(Op1.getOperand(1)) &&
32023 Op1.getConstantOperandVal(1) == 1) {
32024 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32025 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32027 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
32028 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
32029 Op1.getOperand(0) == Op1.getOperand(1)) {
32030 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32031 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32040 /// Generate NEG and CMOV for integer abs.
32041 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
32042 EVT VT = N->getValueType(0);
32044 // Since X86 does not have CMOV for 8-bit integer, we don't convert
32045 // 8-bit integer abs to NEG and CMOV.
32046 if (VT.isInteger() && VT.getSizeInBits() == 8)
32049 SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
32053 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
32054 // and change it to SUB and CMOV.
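  // E.g. (sketch): for i32 this matches the classic branchless abs,
  // 'xor (add x, (sra x, 31)), (sra x, 31)', and emits a NEG followed by a
  // CMOV on the negation's flags instead.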
32055 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32056 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32057 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32058 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32059 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32060 // Generate SUB & CMOV.
32061 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32062 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32063 SDValue Ops[] = {N0.getOperand(0), Neg,
32064 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32065 SDValue(Neg.getNode(), 1)};
      return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
    }
  }
  return SDValue();
}
32072 /// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
32076 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32077 // This is only worth doing if the output type is i8 or i1.
32078 EVT ResultType = N->getValueType(0);
32079 if (ResultType != MVT::i8 && ResultType != MVT::i1)
32082 SDValue N0 = N->getOperand(0);
32083 SDValue N1 = N->getOperand(1);
32085 // We should be performing an xor against a truncated shift.
32086 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32089 // Make sure we are performing an xor against one.
32090 if (!isOneConstant(N1))
32093 // SetCC on x86 zero extends so only act on this if it's a logical shift.
32094 SDValue Shift = N0.getOperand(0);
32095 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32098 // Make sure we are truncating from one of i16, i32 or i64.
32099 EVT ShiftTy = Shift.getValueType();
32100 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32103 // Make sure the shift amount extracts the sign bit.
32104 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32105 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32108 // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical-looking
  // comparison, and using SETGT matches up with what TranslateX86CC expects.
  SDLoc DL(N);
  SDValue ShiftOp = Shift.getOperand(0);
32113 EVT ShiftOpTy = ShiftOp.getValueType();
32114 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32115 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32116 *DAG.getContext(), ResultType);
32117 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32118 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32119 if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
  return Cond;
}
32124 /// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
32129 /// This should be called before type legalization because the pattern may not
32130 /// persist after that.
32131 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32132 const X86Subtarget &Subtarget) {
32133 EVT VT = N->getValueType(0);
32134 if (!VT.isSimple())
32137 switch (VT.getSimpleVT().SimpleTy) {
32138 default: return SDValue();
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32142 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
  }
32149 // There must be a shift right algebraic before the xor, and the xor must be a
32150 // 'not' operation.
32151 SDValue Shift = N->getOperand(0);
32152 SDValue Ones = N->getOperand(1);
32153 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32154 !ISD::isBuildVectorAllOnes(Ones.getNode()))
32157 // The shift should be smearing the sign bit across each vector element.
32158 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
  if (!ShiftBV)
    return SDValue();

  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32163 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32164 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32167 // Create a greater-than comparison against -1. We don't use the more obvious
32168 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32169 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32172 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
32173 /// is valid for the given \p Subtarget.
32174 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32175 const X86Subtarget &Subtarget) {
32176 if (!Subtarget.hasAVX512())
32179 // FIXME: Scalar type may be supported if we move it to vector register.
32180 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32183 EVT SrcElVT = SrcVT.getScalarType();
32184 EVT DstElVT = DstVT.getScalarType();
32185 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32187 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32189 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32190 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32194 /// Detect a pattern of truncation with saturation:
32195 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was
/// not matched.
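/// E.g. (illustrative):
///   %lo = umin <16 x i32> %x, <16 x i32> <i32 65535, ...>
///   %t  = trunc <16 x i32> %lo to <16 x i16>
/// returns %x, which the AVX512 caller can then emit as a saturating
/// truncate such as vpmovusdw.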
32198 static SDValue detectUSatPattern(SDValue In, EVT VT) {
32199 if (In.getOpcode() != ISD::UMIN)
  // Saturation with truncation. We truncate from InVT to VT.
32203 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32204 "Unexpected types for truncate operation");
  APInt C;
  if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
    // the element size of the destination type.
    return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
  }
  return SDValue();
}
32216 /// Detect a pattern of truncation with saturation:
32217 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow using the VPMOVUS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was
/// not matched.
32221 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32222 const X86Subtarget &Subtarget) {
32223 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
  return detectUSatPattern(In, VT);
}

static SDValue
combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32230 const X86Subtarget &Subtarget) {
32231 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32232 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32234 if (auto USatVal = detectUSatPattern(In, VT))
32235 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32236 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32240 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
32241 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
32242 /// X86ISD::AVG instruction.
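/// E.g. (sketch): for two <16 x i8> inputs, the zext/add/add-1/lshr/trunc
/// chain below collapses to a single 'pavgb', computing the rounded unsigned
/// average without ever widening to i32.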
32243 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget,
                                const SDLoc &DL) {
32246 if (!VT.isVector() || !VT.isSimple())
32248 EVT InVT = In.getValueType();
32249 unsigned NumElems = VT.getVectorNumElements();
32251 EVT ScalarVT = VT.getVectorElementType();
32252 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32253 isPowerOf2_32(NumElems)))
32256 // InScalarVT is the intermediate type in AVG pattern and it should be greater
32257 // than the original input type (i8/i16).
32258 EVT InScalarVT = InVT.getVectorElementType();
32259 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
32262 if (!Subtarget.hasSSE2())
32264 if (Subtarget.hasBWI()) {
32265 if (VT.getSizeInBits() > 512)
32267 } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256)
      return SDValue();
  } else {
    if (VT.getSizeInBits() > 128)
      return SDValue();
  }
32275 // Detect the following pattern:
32277 // %1 = zext <N x i8> %a to <N x i32>
32278 // %2 = zext <N x i8> %b to <N x i32>
32279 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32280 // %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
32282 // %6 = trunc <N x i32> %5 to <N x i8>
32284 // In AVX512, the last instruction can also be a trunc store.
32286 if (In.getOpcode() != ISD::SRL)
32289 // A lambda checking the given SDValue is a constant vector and each element
32290 // is in the range [Min, Max].
32291 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32292 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32293 if (!BV || !BV->isConstant())
32295 for (SDValue Op : V->ops()) {
32296 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32299 uint64_t Val = C->getZExtValue();
32300 if (Val < Min || Val > Max)
32306 // Check if each element of the vector is right-shifted by one.
32307 auto LHS = In.getOperand(0);
32308 auto RHS = In.getOperand(1);
32309 if (!IsConstVectorInRange(RHS, 1, 1))
32311 if (LHS.getOpcode() != ISD::ADD)
32314 // Detect a pattern of a + b + 1 where the order doesn't matter.
32315 SDValue Operands[3];
32316 Operands[0] = LHS.getOperand(0);
32317 Operands[1] = LHS.getOperand(1);
32319 // Take care of the case when one of the operands is a constant vector whose
32320 // element is in the range [1, 256].
32321 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32322 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32323 Operands[0].getOperand(0).getValueType() == VT) {
32324 // The pattern is detected. Subtract one from the constant vector, then
32325 // demote it and emit an X86ISD::AVG instruction.
32326 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32327 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32328 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32329 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32333 if (Operands[0].getOpcode() == ISD::ADD)
32334 std::swap(Operands[0], Operands[1]);
32335 else if (Operands[1].getOpcode() != ISD::ADD)
32337 Operands[2] = Operands[1].getOperand(0);
32338 Operands[1] = Operands[1].getOperand(1);
32340 // Now we have three operands of two additions. Check that one of them is a
32341 // constant vector with ones, and the other two are promoted from i8/i16.
32342 for (int i = 0; i < 3; ++i) {
32343 if (!IsConstVectorInRange(Operands[i], 1, 1))
32345 std::swap(Operands[i], Operands[2]);
32347 // Check if Operands[0] and Operands[1] are results of type promotion.
32348 for (int j = 0; j < 2; ++j)
32349 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32350 Operands[j].getOperand(0).getValueType() != VT)
32353 // The pattern is detected, emit an X86ISD::AVG instruction.
32354 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32355 Operands[1].getOperand(0));
32361 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32362 TargetLowering::DAGCombinerInfo &DCI,
32363 const X86Subtarget &Subtarget) {
32364 LoadSDNode *Ld = cast<LoadSDNode>(N);
32365 EVT RegVT = Ld->getValueType(0);
32366 EVT MemVT = Ld->getMemoryVT();
32368 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32370 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32371 // into two 16-byte operations.
32372 ISD::LoadExtType Ext = Ld->getExtensionType();
32374 unsigned AddressSpace = Ld->getAddressSpace();
32375 unsigned Alignment = Ld->getAlignment();
32376 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32377 Ext == ISD::NON_EXTLOAD &&
32378 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32379 AddressSpace, Alignment, &Fast) && !Fast) {
32380 unsigned NumElems = RegVT.getVectorNumElements();
32384 SDValue Ptr = Ld->getBasePtr();
32386 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems / 2);
32389 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32390 Alignment, Ld->getMemOperand()->getFlags());
32392 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32394 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32395 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32396 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32398 Load1.getValue(1), Load2.getValue(1));
32400 SDValue NewVec = DAG.getUNDEF(RegVT);
32401 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32402 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32403 return DCI.CombineTo(N, NewVec, TF, true);
32409 /// If V is a build vector of boolean constants and exactly one of those
32410 /// constants is true, return the operand index of that true element.
32411 /// Otherwise, return -1.
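/// For example (illustrative): a mask of <i1 0, i1 0, i1 1, i1 0> returns 2,
/// while <i1 0, i1 0, i1 0, i1 0> and <i1 1, i1 1, i1 0, i1 0> both return -1.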
32412 static int getOneTrueElt(SDValue V) {
32413 // This needs to be a build vector of booleans.
32414 // TODO: Checking for the i1 type matches the IR definition for the mask,
32415 // but the mask check could be loosened to i8 or other types. That might
32416 // also require checking more than 'allOnesValue'; eg, the x86 HW
32417 // instructions only require that the MSB is set for each mask element.
32418 // The ISD::MSTORE comments/definition do not specify how the mask operand is formatted.
32420 auto *BV = dyn_cast<BuildVectorSDNode>(V);
32421 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32424 int TrueIndex = -1;
32425 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32426 for (unsigned i = 0; i < NumElts; ++i) {
32427 const SDValue &Op = BV->getOperand(i);
32430 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32433 if (ConstNode->getAPIntValue().isAllOnesValue()) {
32434 // If we already found a one, this is too many.
32435 if (TrueIndex >= 0)
32443 /// Given a masked memory load/store operation, return true if it has one mask
32444 /// bit set. If it has one mask bit set, then also return the memory address of
32445 /// the scalar element to load/store, the vector index to insert/extract that
32446 /// scalar element, and the alignment for the scalar memory access.
32447 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32448 SelectionDAG &DAG, SDValue &Addr,
32449 SDValue &Index, unsigned &Alignment) {
32450 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32451 if (TrueMaskElt < 0)
32454 // Get the address of the one scalar element that is specified by the mask
32455 // using the appropriate offset from the base pointer.
32456 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32457 Addr = MaskedOp->getBasePtr();
32458 if (TrueMaskElt != 0) {
32459 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32460 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32463 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
32464 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
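// An illustrative case (assumed values, not from the source): for a v4f32
// masked op whose only set mask bit is element 3, EltVT is f32 with a store
// size of 4, so Addr becomes BasePtr + 12, Index is 3, and Alignment is
// MinAlign(original alignment, 4).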
32468 /// If exactly one element of the mask is set for a non-extending masked load,
32469 /// it can be reduced to a scalar load and a vector insert.
32470 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32471 /// mask have already been optimized in IR, so we don't bother with those here.
32473 static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32474 TargetLowering::DAGCombinerInfo &DCI) {
32475 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32476 // However, some target hooks may need to be added to know when the transform
32477 // is profitable. Endianness would also have to be considered.
32479 SDValue Addr, VecIndex;
32480 unsigned Alignment;
32481 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32484 // Load the one scalar element that is specified by the mask using the
32485 // appropriate offset from the base pointer.
32487 EVT VT = ML->getValueType(0);
32488 EVT EltVT = VT.getVectorElementType();
32490 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32491 Alignment, ML->getMemOperand()->getFlags());
32493 // Insert the loaded element into the appropriate place in the vector.
32494 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32496 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
32500 static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32501 TargetLowering::DAGCombinerInfo &DCI) {
32502 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
32506 EVT VT = ML->getValueType(0);
32508 // If we are loading the first and last elements of a vector, it is safe and
32509 // always faster to load the whole vector. Replace the masked load with a
32510 // vector load and select.
32511 unsigned NumElts = VT.getVectorNumElements();
32512 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
32513 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
32514 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
32515 if (LoadFirstElt && LoadLastElt) {
32516 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32517 ML->getMemOperand());
32518 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
32519 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
32522 // Convert a masked load with a constant mask into a masked load and a select.
32523 // This allows the select operation to use a faster kind of select instruction
32524 // (for example, vblendvps -> vblendps).
32526 // Don't try this if the pass-through operand is already undefined. That would
32527 // cause an infinite loop because that's what we're about to create.
32528 if (ML->getSrc0().isUndef())
32531 // The new masked load has an undef pass-through operand. The select uses the
32532 // original pass-through operand.
32533 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32534 ML->getMask(), DAG.getUNDEF(VT),
32535 ML->getMemoryVT(), ML->getMemOperand(),
32536 ML->getExtensionType());
32537 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
32539 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
32542 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
32543 TargetLowering::DAGCombinerInfo &DCI,
32544 const X86Subtarget &Subtarget) {
32545 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
32547 // TODO: Expanding load with constant mask may be optimized as well.
32548 if (Mld->isExpandingLoad())
32551 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
32552 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
32554 // TODO: Do some AVX512 subsets benefit from this transform?
32555 if (!Subtarget.hasAVX512())
32556 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
32560 if (Mld->getExtensionType() != ISD::SEXTLOAD)
32563 // Resolve extending loads.
32564 EVT VT = Mld->getValueType(0);
32565 unsigned NumElems = VT.getVectorNumElements();
32566 EVT LdVT = Mld->getMemoryVT();
32569 assert(LdVT != VT && "Cannot extend to the same type");
32570 unsigned ToSz = VT.getScalarSizeInBits();
32571 unsigned FromSz = LdVT.getScalarSizeInBits();
32572 // From/To sizes and ElemCount must be powers of two.
32573 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32574 "Unexpected size for extending masked load");
32576 unsigned SizeRatio = ToSz / FromSz;
32577 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
32579 // Create a type on which we perform the shuffle.
32580 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32581 LdVT.getScalarType(), NumElems*SizeRatio);
32582 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
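// Illustrative numbers (assumed, not from the source): for a masked sextload
// from v8i8 memory to a v8i16 result, FromSz = 8, ToSz = 16, SizeRatio = 2,
// and WideVecVT = v16i8, i.e. the same 128 bits as v8i16 but with the memory
// element type, so the pass-through value and the mask can be rewritten in
// terms of the narrow elements.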
32584 // Convert Src0 value.
32585 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
32586 if (!Mld->getSrc0().isUndef()) {
32587 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32588 for (unsigned i = 0; i != NumElems; ++i)
32589 ShuffleVec[i] = i * SizeRatio;
32591 // Can't shuffle using an illegal type.
32592 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32593 "WideVecVT should be legal");
32594 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
32595 DAG.getUNDEF(WideVecVT), ShuffleVec);
32597 // Prepare the new mask.
32598 SDValue NewMask;
32599 SDValue Mask = Mld->getMask();
32600 if (Mask.getValueType() == VT) {
32601 // Mask and original value have the same type.
32602 NewMask = DAG.getBitcast(WideVecVT, Mask);
32603 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32604 for (unsigned i = 0; i != NumElems; ++i)
32605 ShuffleVec[i] = i * SizeRatio;
32606 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
32607 ShuffleVec[i] = NumElems * SizeRatio;
32608 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
32609 DAG.getConstant(0, dl, WideVecVT),
32610 ShuffleVec);
32611 } else {
32612 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
32613 unsigned WidenNumElts = NumElems*SizeRatio;
32614 unsigned MaskNumElts = VT.getVectorNumElements();
32615 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
32616 WidenNumElts);
32618 unsigned NumConcat = WidenNumElts / MaskNumElts;
32619 SmallVector<SDValue, 16> Ops(NumConcat);
32620 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
32621 Ops[0] = Mask;
32622 for (unsigned i = 1; i != NumConcat; ++i)
32623 Ops[i] = ZeroVal;
32625 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
32626 }
32628 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
32629 Mld->getBasePtr(), NewMask, WideSrc0,
32630 Mld->getMemoryVT(), Mld->getMemOperand(),
32632 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
32633 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
32636 /// If exactly one element of the mask is set for a non-truncating masked store,
32637 /// it can be reduced to a vector extract and a scalar store.
32638 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32639 /// mask have already been optimized in IR, so we don't bother with those here.
32640 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
32641 SelectionDAG &DAG) {
32642 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32643 // However, some target hooks may need to be added to know when the transform
32644 // is profitable. Endianness would also have to be considered.
32646 SDValue Addr, VecIndex;
32647 unsigned Alignment;
32648 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
32651 // Extract the one scalar element that is actually being stored.
32653 EVT VT = MS->getValue().getValueType();
32654 EVT EltVT = VT.getVectorElementType();
32655 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
32656 MS->getValue(), VecIndex);
32658 // Store that element at the appropriate offset from the base pointer.
32659 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
32660 Alignment, MS->getMemOperand()->getFlags());
32663 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
32664 const X86Subtarget &Subtarget) {
32665 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
32667 if (Mst->isCompressingStore())
32670 if (!Mst->isTruncatingStore())
32671 return reduceMaskedStoreToScalarStore(Mst, DAG);
32673 // Resolve truncating stores.
32674 EVT VT = Mst->getValue().getValueType();
32675 unsigned NumElems = VT.getVectorNumElements();
32676 EVT StVT = Mst->getMemoryVT();
32679 assert(StVT != VT && "Cannot truncate to the same type");
32680 unsigned FromSz = VT.getScalarSizeInBits();
32681 unsigned ToSz = StVT.getScalarSizeInBits();
32683 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32685 // The truncating store is legal in some cases. For example
32686 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
32687 // are provided as truncating stores.
32688 // In this case we don't need any further transformations.
32689 if (TLI.isTruncStoreLegal(VT, StVT))
32692 // From/To sizes and ElemCount must be powers of two.
32693 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32694 "Unexpected size for truncating masked store");
32695 // We are going to use the original vector elt for storing.
32696 // Accumulated smaller vector elements must be a multiple of the store size.
32697 assert (((NumElems * FromSz) % ToSz) == 0 &&
32698 "Unexpected ratio for truncating masked store");
32700 unsigned SizeRatio = FromSz / ToSz;
32701 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32703 // Create a type on which we perform the shuffle.
32704 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32705 StVT.getScalarType(), NumElems*SizeRatio);
32707 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
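// Illustrative numbers (assumed, not from the source): when truncating a
// v8i32 value to v8i16 memory, FromSz = 32, ToSz = 16, SizeRatio = 2 and
// WideVecVT = v16i16, so the shuffle below packs the eight truncated i16
// values into the low elements of the wide register.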
32709 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
32710 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32711 for (unsigned i = 0; i != NumElems; ++i)
32712 ShuffleVec[i] = i * SizeRatio;
32714 // Can't shuffle using an illegal type.
32715 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32716 "WideVecVT should be legal");
32718 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
32719 DAG.getUNDEF(WideVecVT),
32720 ShuffleVec);
32722 SDValue NewMask;
32723 SDValue Mask = Mst->getMask();
32724 if (Mask.getValueType() == VT) {
32725 // Mask and original value have the same type.
32726 NewMask = DAG.getBitcast(WideVecVT, Mask);
32727 for (unsigned i = 0; i != NumElems; ++i)
32728 ShuffleVec[i] = i * SizeRatio;
32729 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
32730 ShuffleVec[i] = NumElems*SizeRatio;
32731 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
32732 DAG.getConstant(0, dl, WideVecVT),
32733 ShuffleVec);
32734 } else {
32735 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
32736 unsigned WidenNumElts = NumElems*SizeRatio;
32737 unsigned MaskNumElts = VT.getVectorNumElements();
32738 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
32739 WidenNumElts);
32741 unsigned NumConcat = WidenNumElts / MaskNumElts;
32742 SmallVector<SDValue, 16> Ops(NumConcat);
32743 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
32744 Ops[0] = Mask;
32745 for (unsigned i = 1; i != NumConcat; ++i)
32746 Ops[i] = ZeroVal;
32748 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
32749 }
32751 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
32752 Mst->getBasePtr(), NewMask, StVT,
32753 Mst->getMemOperand(), false);
32756 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
32757 const X86Subtarget &Subtarget) {
32758 StoreSDNode *St = cast<StoreSDNode>(N);
32759 EVT VT = St->getValue().getValueType();
32760 EVT StVT = St->getMemoryVT();
32762 SDValue StoredVal = St->getOperand(1);
32763 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32765 // If we are saving a concatenation of two XMM registers and 32-byte stores
32766 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
32768 unsigned AddressSpace = St->getAddressSpace();
32769 unsigned Alignment = St->getAlignment();
32770 if (VT.is256BitVector() && StVT == VT &&
32771 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
32772 AddressSpace, Alignment, &Fast) &&
32774 unsigned NumElems = VT.getVectorNumElements();
32778 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
32779 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
32781 SDValue Ptr0 = St->getBasePtr();
32782 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
32785 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
32786 Alignment, St->getMemOperand()->getFlags());
32788 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
32789 std::min(16U, Alignment), St->getMemOperand()->getFlags());
32790 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
32793 // Optimize trunc store (of multiple scalars) to shuffle and store.
32794 // First, pack all of the elements in one place. Next, store to memory
32795 // in fewer chunks.
32796 if (St->isTruncatingStore() && VT.isVector()) {
32797 // Check if we can detect an AVG pattern from the truncation. If yes,
32798 // replace the trunc store by a normal store with the result of the X86ISD::AVG instruction.
32800 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl))
32802 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
32803 St->getPointerInfo(), St->getAlignment(),
32804 St->getMemOperand()->getFlags());
32807 if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
32808 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
32809 dl, Val, St->getBasePtr(),
32810 St->getMemoryVT(), St->getMemOperand(), DAG);
32812 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32813 unsigned NumElems = VT.getVectorNumElements();
32814 assert(StVT != VT && "Cannot truncate to the same type");
32815 unsigned FromSz = VT.getScalarSizeInBits();
32816 unsigned ToSz = StVT.getScalarSizeInBits();
32818 // The truncating store is legal in some cases. For example
32819 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
32820 // are provided as truncating stores.
32821 // In this case we don't need any further transformations.
32822 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
32825 // From/To sizes and ElemCount must be powers of two.
32826 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
32827 // We are going to use the original vector elt for storing.
32828 // Accumulated smaller vector elements must be a multiple of the store size.
32829 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
32831 unsigned SizeRatio = FromSz / ToSz;
32833 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32835 // Create a type on which we perform the shuffle
32836 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32837 StVT.getScalarType(), NumElems*SizeRatio);
32839 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32841 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
32842 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
32843 for (unsigned i = 0; i != NumElems; ++i)
32844 ShuffleVec[i] = i * SizeRatio;
32846 // Can't shuffle using an illegal type.
32847 if (!TLI.isTypeLegal(WideVecVT))
32850 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
32851 DAG.getUNDEF(WideVecVT),
32853 // At this point all of the data is stored at the bottom of the
32854 // register. We now need to save it to memory.
32856 // Find the largest store unit
32857 MVT StoreType = MVT::i8;
32858 for (MVT Tp : MVT::integer_valuetypes()) {
32859 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
32860 StoreType = Tp;
32861 }
32863 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
32864 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
32865 (64 <= NumElems * ToSz))
32866 StoreType = MVT::f64;
32868 // Bitcast the original vector into a vector of store-size units
32869 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
32870 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
32871 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
32872 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
32873 SmallVector<SDValue, 8> Chains;
32874 SDValue Ptr = St->getBasePtr();
32876 // Perform one or more big stores into memory.
32877 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
32878 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
32879 StoreType, ShuffWide,
32880 DAG.getIntPtrConstant(i, dl));
32882 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
32883 St->getAlignment(), St->getMemOperand()->getFlags());
32884 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
32885 Chains.push_back(Ch);
32888 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
32891 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
32892 // the FP state in cases where an emms may be missing.
32893 // A preferable solution to the general problem is to figure out the right
32894 // places to insert EMMS. This qualifies as a quick hack.
32896 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
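// A minimal sketch of the i64 case (assumed IR, not from the source): on a
// 32-bit target with SSE2,
//   %v = load i64, i64* %p
//   store i64 %v, i64* %q
// becomes an f64 load/store pair, moving the 8 bytes in one XMM access
// instead of two 4-byte GPR accesses.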
32897 if (VT.getSizeInBits() != 64)
32900 const Function *F = DAG.getMachineFunction().getFunction();
32901 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
32903 bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
32904 if ((VT.isVector() ||
32905 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
32906 isa<LoadSDNode>(St->getValue()) &&
32907 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
32908 St->getChain().hasOneUse() && !St->isVolatile()) {
32909 SDNode* LdVal = St->getValue().getNode();
32910 LoadSDNode *Ld = nullptr;
32911 int TokenFactorIndex = -1;
32912 SmallVector<SDValue, 8> Ops;
32913 SDNode* ChainVal = St->getChain().getNode();
32914 // Must be a store of a load. We currently handle two cases: the load
32915 // is a direct child, or it's under an intervening TokenFactor. It is
32916 // possible to dig deeper under nested TokenFactors.
32917 if (ChainVal == LdVal)
32918 Ld = cast<LoadSDNode>(St->getChain());
32919 else if (St->getValue().hasOneUse() &&
32920 ChainVal->getOpcode() == ISD::TokenFactor) {
32921 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
32922 if (ChainVal->getOperand(i).getNode() == LdVal) {
32923 TokenFactorIndex = i;
32924 Ld = cast<LoadSDNode>(St->getValue());
32926 Ops.push_back(ChainVal->getOperand(i));
32930 if (!Ld || !ISD::isNormalLoad(Ld))
32933 // If this is not the MMX case, i.e. we are just turning i64 load/store
32934 // into f64 load/store, avoid the transformation if there are multiple
32935 // uses of the loaded value.
32936 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
32941 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
32942 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store pair instead.
32944 if (Subtarget.is64Bit() || F64IsLegal) {
32945 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
32946 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
32947 Ld->getPointerInfo(), Ld->getAlignment(),
32948 Ld->getMemOperand()->getFlags());
32949 SDValue NewChain = NewLd.getValue(1);
32950 if (TokenFactorIndex >= 0) {
32951 Ops.push_back(NewChain);
32952 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32954 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
32955 St->getPointerInfo(), St->getAlignment(),
32956 St->getMemOperand()->getFlags());
32959 // Otherwise, lower to two pairs of 32-bit loads / stores.
32960 SDValue LoAddr = Ld->getBasePtr();
32961 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
32963 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
32964 Ld->getPointerInfo(), Ld->getAlignment(),
32965 Ld->getMemOperand()->getFlags());
32966 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
32967 Ld->getPointerInfo().getWithOffset(4),
32968 MinAlign(Ld->getAlignment(), 4),
32969 Ld->getMemOperand()->getFlags());
32971 SDValue NewChain = LoLd.getValue(1);
32972 if (TokenFactorIndex >= 0) {
32973 Ops.push_back(LoLd);
32974 Ops.push_back(HiLd);
32975 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32978 LoAddr = St->getBasePtr();
32979 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
32982 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
32983 St->getAlignment(), St->getMemOperand()->getFlags());
32984 SDValue HiSt = DAG.getStore(
32985 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
32986 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
32987 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
32990 // This is similar to the above case, but here we handle a scalar 64-bit
32991 // integer store that is extracted from a vector on a 32-bit target.
32992 // If we have SSE2, then we can treat it like a floating-point double
32993 // to get past legalization. The execution dependencies fixup pass will
32994 // choose the optimal machine instruction for the store if this really is
32995 // an integer or v2f32 rather than an f64.
32996 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
32997 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
32998 SDValue OldExtract = St->getOperand(1);
32999 SDValue ExtOp0 = OldExtract.getOperand(0);
33000 unsigned VecSize = ExtOp0.getValueSizeInBits();
33001 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
33002 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
33003 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
33004 BitCast, OldExtract.getOperand(1));
33005 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
33006 St->getPointerInfo(), St->getAlignment(),
33007 St->getMemOperand()->getFlags());
33013 /// Return 'true' if this vector operation is "horizontal"
33014 /// and return the operands for the horizontal operation in LHS and RHS. A
33015 /// horizontal operation performs the binary operation on successive elements
33016 /// of its first operand, then on successive elements of its second operand,
33017 /// returning the resulting values in a vector. For example, if
33018 /// A = < float a0, float a1, float a2, float a3 >
33020 /// B = < float b0, float b1, float b2, float b3 >
33021 /// then the result of doing a horizontal operation on A and B is
33022 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
33023 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
33024 /// A horizontal-op B, for some already available A and B, and if so then LHS is
33025 /// set to A, RHS to B, and the routine returns 'true'.
33026 /// Note that the binary operation should have the property that if one of the
33027 /// operands is UNDEF then the result is UNDEF.
33028 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
33029 // Look for the following pattern: if
33030 // A = < float a0, float a1, float a2, float a3 >
33031 // B = < float b0, float b1, float b2, float b3 >
33033 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
33034 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
33035 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
33036 // which is A horizontal-op B.
33038 // At least one of the operands should be a vector shuffle.
33039 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
33040 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
33043 MVT VT = LHS.getSimpleValueType();
33045 assert((VT.is128BitVector() || VT.is256BitVector()) &&
33046 "Unsupported vector type for horizontal add/sub");
33048 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
33049 // operate independently on 128-bit lanes.
33050 unsigned NumElts = VT.getVectorNumElements();
33051 unsigned NumLanes = VT.getSizeInBits()/128;
33052 unsigned NumLaneElts = NumElts / NumLanes;
33053 assert((NumLaneElts % 2 == 0) &&
33054 "Vector type should have an even number of elements in each lane");
33055 unsigned HalfLaneElts = NumLaneElts/2;
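// Illustrative lane math (assuming v8f32, not from the source): NumElts = 8,
// NumLanes = 2, NumLaneElts = 4 and HalfLaneElts = 2, so within each 128-bit
// lane the first two results come from LHS/A and the last two from RHS/B.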
33057 // View LHS in the form
33058 // LHS = VECTOR_SHUFFLE A, B, LMask
33059 // If LHS is not a shuffle then pretend it is the shuffle
33060 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
33061 // NOTE: in what follows a default initialized SDValue represents an UNDEF of type VT.
33064 SmallVector<int, 16> LMask(NumElts);
33065 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33066 if (!LHS.getOperand(0).isUndef())
33067 A = LHS.getOperand(0);
33068 if (!LHS.getOperand(1).isUndef())
33069 B = LHS.getOperand(1);
33070 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
33071 std::copy(Mask.begin(), Mask.end(), LMask.begin());
33073 if (!LHS.isUndef())
33075 for (unsigned i = 0; i != NumElts; ++i)
33079 // Likewise, view RHS in the form
33080 // RHS = VECTOR_SHUFFLE C, D, RMask
33082 SmallVector<int, 16> RMask(NumElts);
33083 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33084 if (!RHS.getOperand(0).isUndef())
33085 C = RHS.getOperand(0);
33086 if (!RHS.getOperand(1).isUndef())
33087 D = RHS.getOperand(1);
33088 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
33089 std::copy(Mask.begin(), Mask.end(), RMask.begin());
33091 if (!RHS.isUndef())
33093 for (unsigned i = 0; i != NumElts; ++i)
33097 // Check that the shuffles are both shuffling the same vectors.
33098 if (!(A == C && B == D) && !(A == D && B == C))
33101 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
33102 if (!A.getNode() && !B.getNode())
33105 // If A and B occur in reverse order in RHS, then "swap" them (which means
33106 // rewriting the mask).
33108 ShuffleVectorSDNode::commuteMask(RMask);
33110 // At this point LHS and RHS are equivalent to
33111 // LHS = VECTOR_SHUFFLE A, B, LMask
33112 // RHS = VECTOR_SHUFFLE A, B, RMask
33113 // Check that the masks correspond to performing a horizontal operation.
33114 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
33115 for (unsigned i = 0; i != NumLaneElts; ++i) {
33116 int LIdx = LMask[i+l], RIdx = RMask[i+l];
33118 // Ignore any UNDEF components.
33119 if (LIdx < 0 || RIdx < 0 ||
33120 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
33121 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
33124 // Check that successive elements are being operated on. If not, this is
33125 // not a horizontal operation.
33126 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
33127 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
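// Worked example (assuming v4f32 and l = 0, not from the source): for i = 2,
// Src = 1 and Index = 2*0 + 4*1 + 0 = 4, so the masks must select elements 4
// and 5 of the concatenated pair (A, B), i.e. b0 and b1, matching the
// "b0 op b1" slot of the horizontal op.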
33128 if (!(LIdx == Index && RIdx == Index + 1) &&
33129 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
33134 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
33135 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
33139 /// Do target-specific dag combines on floating-point adds/subs.
33140 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33141 const X86Subtarget &Subtarget) {
33142 EVT VT = N->getValueType(0);
33143 SDValue LHS = N->getOperand(0);
33144 SDValue RHS = N->getOperand(1);
33145 bool IsFadd = N->getOpcode() == ISD::FADD;
33146 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33148 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33149 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33150 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33151 isHorizontalBinOp(LHS, RHS, IsFadd)) {
33152 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
33153 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
33158 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
33160 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
33161 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
33162 const X86Subtarget &Subtarget,
33164 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33165 SDValue Src = N->getOperand(0);
33166 unsigned Opcode = Src.getOpcode();
33167 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33169 EVT VT = N->getValueType(0);
33170 EVT SrcVT = Src.getValueType();
33172 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
33173 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
33175 // Repeated operand, so we are only trading one output truncation for
33176 // one input truncation.
33180 // See if either operand has been extended from a smaller/equal size to
33181 // the truncation size, allowing a truncation to combine with the extend.
33182 unsigned Opcode0 = Op0.getOpcode();
33183 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
33184 Opcode0 == ISD::ZERO_EXTEND) &&
33185 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33188 unsigned Opcode1 = Op1.getOpcode();
33189 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
33190 Opcode1 == ISD::ZERO_EXTEND) &&
33191 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33194 // See if either operand is a single use constant which can be constant folded.
33196 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
33197 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
33198 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
33199 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
33202 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
33203 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
33204 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
33205 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
33208 // Don't combine if the operation has other uses.
33209 if (!N->isOnlyUserOf(Src.getNode()))
33212 // Only support vector truncation for now.
33213 // TODO: i64 scalar math would benefit as well.
33214 if (!VT.isVector())
33217 // In most cases it's only worth pre-truncating if we're only facing the cost
33218 // of one truncation,
33219 // i.e. if one of the inputs will constant fold or the input is repeated.
33224 SDValue Op0 = Src.getOperand(0);
33225 SDValue Op1 = Src.getOperand(1);
33226 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
33227 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33228 return TruncateArithmetic(Op0, Op1);
33233 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
33234 // better to truncate if we have the chance.
33235 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
33236 !TLI.isOperationLegal(Opcode, SrcVT))
33237 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
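// For instance (illustrative): trunc (mul <4 x i64> %x, %y) to <4 x i32>
// becomes mul <4 x i32> (trunc %x), (trunc %y), replacing an expensive
// 64-bit element multiply with a cheap 32-bit one.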
33240 SDValue Op0 = Src.getOperand(0);
33241 SDValue Op1 = Src.getOperand(1);
33242 if (TLI.isOperationLegal(Opcode, VT) &&
33243 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33244 return TruncateArithmetic(Op0, Op1);
33252 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
33254 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
33255 SmallVector<SDValue, 8> &Regs) {
33256 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33257 Regs[0].getValueType() == MVT::v2i64));
33258 EVT OutVT = N->getValueType(0);
33259 EVT OutSVT = OutVT.getVectorElementType();
33260 EVT InVT = Regs[0].getValueType();
33261 EVT InSVT = InVT.getVectorElementType();
33264 // First, use mask to unset all bits that won't appear in the result.
33265 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33266 "OutSVT can only be either i8 or i16.");
33268 APInt Mask = APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
33269 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
33270 for (auto &Reg : Regs)
33271 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
33273 MVT UnpackedVT, PackedVT;
33274 if (OutSVT == MVT::i8) {
33275 UnpackedVT = MVT::v8i16;
33276 PackedVT = MVT::v16i8;
33278 UnpackedVT = MVT::v4i32;
33279 PackedVT = MVT::v8i16;
33282 // In each iteration, halve the element size of the type.
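// Illustrative flow (assuming four v4i32 inputs truncated to v16i8, not from
// the source): the loop below runs twice, first packing 4 registers down to
// 2, then 2 down to 1, with each X86ISD::PACKUS combining two source
// registers into one narrower register.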
33283 auto RegNum = Regs.size();
33284 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
33285 j < e; j *= 2, RegNum /= 2) {
33286 for (unsigned i = 0; i < RegNum; i++)
33287 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
33288 for (unsigned i = 0; i < RegNum / 2; i++)
33289 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
33293 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
33294 // and then extract a subvector as the result since v8i8 is not a legal type.
33295 if (OutVT == MVT::v8i8) {
33296 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
33297 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
33298 DAG.getIntPtrConstant(0, DL));
33300 } else if (RegNum > 1) {
33301 Regs.resize(RegNum);
33302 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33307 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
33309 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
33311 SmallVector<SDValue, 8> &Regs) {
33312 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33313 EVT OutVT = N->getValueType(0);
33316 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
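// For example (illustrative values): a v4i32 lane holding 0x00001234 is
// unchanged by the shl/sra pair, while 0x0000FFFF becomes 0xFFFFFFFF, i.e.
// the low 16 bits are sign-extended to 32 bits so that PACKSS saturates to
// the intended i16 value.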
33317 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
33318 for (auto &Reg : Regs) {
33319 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
33321 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
33325 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
33326 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
33329 if (Regs.size() > 2) {
33330 Regs.resize(Regs.size() / 2);
33331 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33336 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33337 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33338 /// legalization the truncation will be translated into a BUILD_VECTOR whose
33339 /// elements are extracted from a vector and then truncated, and it is
33340 /// difficult to perform this optimization on that form.
33341 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33342 const X86Subtarget &Subtarget) {
33343 EVT OutVT = N->getValueType(0);
33344 if (!OutVT.isVector())
33347 SDValue In = N->getOperand(0);
33348 if (!In.getValueType().isSimple())
33351 EVT InVT = In.getValueType();
33352 unsigned NumElems = OutVT.getVectorNumElements();
33354 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
33355 // SSE2, and we need to take care of it specially.
33356 // AVX512 provides vpmovdb.
33357 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
33360 EVT OutSVT = OutVT.getVectorElementType();
33361 EVT InSVT = InVT.getVectorElementType();
33362 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
33363 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
33367 // SSSE3's pshufb results in fewer instructions in the cases below.
33368 if (Subtarget.hasSSSE3() && NumElems == 8 &&
33369 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
33370 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
33375 // Split a long vector into vectors of legal type.
33376 unsigned RegNum = InVT.getSizeInBits() / 128;
33377 SmallVector<SDValue, 8> SubVec(RegNum);
33378 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33379 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33381 for (unsigned i = 0; i < RegNum; i++)
33382 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33383 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33385 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33386 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33387 // truncate 2 x v4i32 to v8i16.
33388 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33389 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33390 else if (InSVT == MVT::i32)
33391 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
33396 /// This function transforms vector truncation of 'all or none' bits values
33397 /// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS operations.
33398 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
33400 const X86Subtarget &Subtarget) {
33401 // Requires SSE2 but AVX512 has fast truncate.
33402 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
33405 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
33408 SDValue In = N->getOperand(0);
33409 if (!In.getValueType().isSimple())
33412 MVT VT = N->getValueType(0).getSimpleVT();
33413 MVT SVT = VT.getScalarType();
33415 MVT InVT = In.getValueType().getSimpleVT();
33416 MVT InSVT = InVT.getScalarType();
33418 // Use PACKSS if the input is a splatted sign bit.
33419 // e.g. Comparison result, sext_in_reg, etc.
33420 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
33421 if (NumSignBits != InSVT.getSizeInBits())
33424 // Check we have a truncation suited for PACKSS.
33425 if (!VT.is128BitVector() && !VT.is256BitVector())
33427 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
33429 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
33432 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
33435 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33436 const X86Subtarget &Subtarget) {
33437 EVT VT = N->getValueType(0);
33438 SDValue Src = N->getOperand(0);
33441 // Attempt to pre-truncate inputs to arithmetic ops instead.
33442 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
33445 // Try to detect AVG pattern first.
33446 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
33449 // Try to combine truncation with unsigned saturation.
33450 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
33453 // The bitcast source is a direct mmx result.
33454 // Detect a bitcast from x86mmx to i32.
33455 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
33456 SDValue BCSrc = Src.getOperand(0);
33457 if (BCSrc.getValueType() == MVT::x86mmx)
33458 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
33461 // Try to truncate extended sign bits with PACKSS.
33462 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
33465 return combineVectorTruncation(N, DAG, Subtarget);
33468 /// Returns the negated value if the node \p N flips the sign of an FP value.
33470 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
33471 /// AVX512F does not have FXOR, so FNEG is lowered as
33472 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33473 /// In this case we go through all bitcasts.
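/// For example (illustrative, f32 elements): the sign mask is 0x80000000, so
///   (bitcast (xor (bitcast v4f32 %x to v4i32), <i32 0x80000000, ...>)
///            to v4f32)
/// is recognized as a negation of %x.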
33474 static SDValue isFNEG(SDNode *N) {
33475 if (N->getOpcode() == ISD::FNEG)
33476 return N->getOperand(0);
33478 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
33479 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
33482 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
33483 if (!Op1.getValueType().isFloatingPoint())
33486 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
33488 unsigned EltBits = Op1.getScalarValueSizeInBits();
33489 auto isSignMask = [&](const ConstantFP *C) {
33490 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
33493 // There is more than one way to represent the same constant on
33494 // different X86 targets. The type of the node may also depend on its size.
33495 // - load scalar value and broadcast
33496 // - BUILD_VECTOR node
33497 // - load from a constant pool.
33498 // We check all variants here.
33499 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
33500 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
33501 if (isSignMask(cast<ConstantFP>(C)))
33504 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
33505 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
33506 if (isSignMask(CN->getConstantFPValue()))
33509 } else if (auto *C = getTargetConstantFromNode(Op1)) {
33510 if (C->getType()->isVectorTy()) {
33511 if (auto *SplatV = C->getSplatValue())
33512 if (isSignMask(cast<ConstantFP>(SplatV)))
33514 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
33515 if (isSignMask(FPConst))
33521 /// Do target-specific dag combines on floating point negations.
33522 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
33523 const X86Subtarget &Subtarget) {
33524 EVT OrigVT = N->getValueType(0);
33525 SDValue Arg = isFNEG(N);
33526 assert(Arg.getNode() && "N is expected to be an FNEG node");
33528 EVT VT = Arg.getValueType();
33529 EVT SVT = VT.getScalarType();
33532 // Let legalize expand this if it isn't a legal type yet.
33533 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33536 // If we're negating an FMUL node on a target with FMA, then we can avoid the
33537 // use of a constant by performing (-0 - A*B) instead.
33538 // FIXME: Check rounding control flags as well once they become available.
33539 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
33540 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
33541 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
33542 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
33543 Arg.getOperand(1), Zero);
33544 return DAG.getBitcast(OrigVT, NewNode);
33547 // If we're negating an FMA node, then we can adjust the
33548 // instruction to include the extra negation.
33549 unsigned NewOpcode = 0;
33550 if (Arg.hasOneUse()) {
33551 switch (Arg.getOpcode()) {
33552 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
33553 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
33554 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
33555 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
33556 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
33557 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
33558 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
33559 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
33560 // We can't handle scalar intrinsic nodes here because they would only
33561 // invert one element and not the whole vector. But we could try to handle
33562 // a negation of the lower element only.
33566 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
33567 Arg.getNode()->ops()));
33572 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
33573 const X86Subtarget &Subtarget) {
33574 MVT VT = N->getSimpleValueType(0);
33575 // If we have integer vector types available, use the integer opcodes.
33576 if (VT.isVector() && Subtarget.hasSSE2()) {
33579 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
33581 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
33582 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
33583 unsigned IntOpcode;
33584 switch (N->getOpcode()) {
33585 default: llvm_unreachable("Unexpected FP logic op");
33586 case X86ISD::FOR: IntOpcode = ISD::OR; break;
33587 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
33588 case X86ISD::FAND: IntOpcode = ISD::AND; break;
33589 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
33591 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
33592 return DAG.getBitcast(VT, IntOp);
33597 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
33598 TargetLowering::DAGCombinerInfo &DCI,
33599 const X86Subtarget &Subtarget) {
33600 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
33603 if (DCI.isBeforeLegalizeOps())
33606 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
33609 if (Subtarget.hasCMov())
33610 if (SDValue RV = combineIntegerAbs(N, DAG))
33613 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33617 return combineFneg(N, DAG, Subtarget);
33622 static bool isNullFPScalarOrVectorConst(SDValue V) {
33623 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
33626 /// If a value is a scalar FP zero or a vector FP zero (potentially including
33627 /// undefined elements), return a zero constant that may be used to fold away
33628 /// that value. In the case of a vector, the returned constant will not contain
33629 /// undefined elements even if the input parameter does. This makes it suitable
33630 /// to be used as a replacement operand with operations (eg, bitwise-and) where
33631 /// an undef should not propagate.
33632 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
33633 const X86Subtarget &Subtarget) {
33634 if (!isNullFPScalarOrVectorConst(V))
33637 if (V.getValueType().isVector())
33638 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
33643 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
33644 const X86Subtarget &Subtarget) {
33645 SDValue N0 = N->getOperand(0);
33646 SDValue N1 = N->getOperand(1);
33647 EVT VT = N->getValueType(0);
33650 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
33651 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
33652 (VT == MVT::f64 && Subtarget.hasSSE2())))
33655 auto isAllOnesConstantFP = [](SDValue V) {
33656 auto *C = dyn_cast<ConstantFPSDNode>(V);
33657 return C && C->getConstantFPValue()->isAllOnesValue();
33660 // fand (fxor X, -1), Y --> fandn X, Y
33661 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
33662 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
33664 // fand X, (fxor Y, -1) --> fandn Y, X
33665 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
33666 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
33671 /// Do target-specific dag combines on X86ISD::FAND nodes.
33672 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
33673 const X86Subtarget &Subtarget) {
33674 // FAND(0.0, x) -> 0.0
33675 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
33678 // FAND(x, 0.0) -> 0.0
33679 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
33682 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
33685 return lowerX86FPLogicOp(N, DAG, Subtarget);
33688 /// Do target-specific dag combines on X86ISD::FANDN nodes.
33689 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
33690 const X86Subtarget &Subtarget) {
33691 // FANDN(0.0, x) -> x
33692 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33693 return N->getOperand(1);
33695 // FANDN(x, 0.0) -> 0.0
33696 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
33699 return lowerX86FPLogicOp(N, DAG, Subtarget);
33702 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
33703 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
33704 const X86Subtarget &Subtarget) {
33705 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
33707 // F[X]OR(0.0, x) -> x
33708 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33709 return N->getOperand(1);
33711 // F[X]OR(x, 0.0) -> x
33712 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
33713 return N->getOperand(0);
33716 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
33719 return lowerX86FPLogicOp(N, DAG, Subtarget);
33722 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
33723 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
33724 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
33726 // Only perform optimizations if UnsafeMath is used.
33727 if (!DAG.getTarget().Options.UnsafeFPMath)
33730 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
33731 // into FMINC and FMAXC, which are commutative operations.
33732 unsigned NewOp = 0;
33733 switch (N->getOpcode()) {
33734 default: llvm_unreachable("unknown opcode");
33735 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
33736 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
33739 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
33740 N->getOperand(0), N->getOperand(1));
33743 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
33744 const X86Subtarget &Subtarget) {
33745 if (Subtarget.useSoftFloat())
33748 // TODO: Check for global or instruction-level "nnan". In that case, we
33749 // should be able to lower to FMAX/FMIN alone.
33750 // TODO: If an operand is already known to be a NaN or not a NaN, this
33751 // should be an optional swap and FMAX/FMIN.
33753 EVT VT = N->getValueType(0);
33754 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
33755 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
33756 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
33759 // This takes at least 3 instructions, so favor a library call when operating
33760 // on a scalar and minimizing code size.
33761 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
33764 SDValue Op0 = N->getOperand(0);
33765 SDValue Op1 = N->getOperand(1);
33767 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
33768 DAG.getDataLayout(), *DAG.getContext(), VT);
33770 // There are 4 possibilities involving NaN inputs, and these are the required
33771 // outputs:
33772 //                   Op1
33773 //               Num     NaN
33774 //            ----------------
33775 //       Num  |  Max  |  Op0 |
33776 // Op0        ----------------
33777 //       NaN  |  Op1  |  NaN |
33778 //            ----------------
33780 // The SSE FP max/min instructions were not designed for this case, but rather
33781 // to implement:
33782 //   Min = Op1 < Op0 ? Op1 : Op0
33783 //   Max = Op1 > Op0 ? Op1 : Op0
33785 // So they always return Op0 if either input is a NaN. However, we can still
33786 // use those instructions for fmaxnum by selecting away a NaN input.
33788 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
33789 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
33790 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
33791 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
33793 // If Op0 is a NaN, select Op1. Otherwise, select the min or max. If both
33794 // operands are NaN, the NaN value of Op1 is the result.
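// For example (illustrative, fmaxnum): with Op0 = NaN and Op1 = 1.0,
// MinOrMax = FMAX(1.0, NaN) passes the NaN through, but IsOp0Nan is true, so
// the select yields 1.0, matching the IEEE-754 maxNum rule of ignoring a
// single NaN input.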
33795 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
33798 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
33799 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
33800 TargetLowering::DAGCombinerInfo &DCI,
33801 const X86Subtarget &Subtarget) {
33802 // ANDNP(0, x) -> x
33803 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
33804 return N->getOperand(1);
33806 // ANDNP(x, 0) -> 0
33807 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
33808 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
33810 EVT VT = N->getValueType(0);
33812 // Attempt to recursively combine a bitmask ANDNP with shuffles.
33813 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
33815 SmallVector<int, 1> NonceMask; // Just a placeholder.
33816 NonceMask.push_back(0);
33817 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
33818 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
33820 return SDValue(); // This routine will use CombineTo to replace N.
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
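    // E.g. for a 32-bit index operand only the low Log2_32(32) == 5 bits are
    // demanded, because BT reduces the bit index modulo the operand width.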
    KnownBits Known;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
  // AVX2, since there is no sign-extended shift-right operation on a vector
  // with 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2: it may be replaced with an
    // X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no-overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags().hasNoSignedWrap();
  bool NUW = Add->getFlags().hasNoUnsignedWrap();

  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext' source.
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddOp1)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
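  // E.g. once the extend is hoisted, (add (sext x), C) feeding another 'add'
  // or 'shl' can fold into a single LEA addressing mode
  // (base + index * scale + C), removing the separate extend and add.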
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
  SDValue AddOp0 = Add.getOperand(0);
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // sign- or zero-extended.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}

/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
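  // An x86 8-bit divide leaves the quotient in AL and the remainder in AH;
  // the *_HREG nodes model reading AH already sign-/zero-extended to i32, so
  // the separate extend of the remainder (result 1) becomes free.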
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  return R.getValue(1);
}

/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating
/// with UNDEFs) the input into vectors of the same size as the target type,
/// which then extend the lowest elements.
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InVT = N0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // Input type must be a vector and we must be extending legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  // On AVX2+ targets, if the input/output types are both legal then we will be
  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  SDLoc DL(N);

  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
    EVT InVT = N.getValueType();
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
                                 Size / InVT.getScalarSizeInBits());
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
  };
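  // E.g. ExtendVecSize(DL, v4i16, 128) concatenates the v4i16 input with one
  // undef v4i16 to form a v8i16 whose low four elements are the original
  // input.
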
  // If target-size is less than 128-bits, extend to a type that would extend
  // to 128 bits, extend that and extract the original target vector.
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
    unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                       DAG.getIntPtrConstant(0, DL));
  }

  // If target-size is 128-bits (or 256-bits on AVX2 targets), then convert to
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to allow the legalizer to do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
      (VT.is512BitVector() && Subtarget.hasAVX512())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
  }

  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

    SmallVector<SDValue, 8> Opnds;
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                   DAG.getIntPtrConstant(Offset, DL));
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
      SrcVec = Opcode == ISD::SIGN_EXTEND
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
      Opnds.push_back(SrcVec);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
  };

  // On pre-AVX2 targets, split into 128-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
    return SplitAndExtendInReg(128);

  // On pre-AVX512 targets, split into 256-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
    return SplitAndExtendInReg(256);

  return SDValue();
}

static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (!DCI.isBeforeLegalizeOps()) {
    if (InVT == MVT::i1) {
      SDValue Zero = DAG.getConstant(0, DL, VT);
      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
      return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
    }
    return SDValue();
  }

  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    // Inverting and sign-extending a boolean is the same as zero-extending
    // and subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract
    // is efficiently lowered with an LEA or a DEC. This is the same as:
    // select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
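    // Check: Bool = 0 gives sext(1) = -1 and zext(0) - 1 = -1; Bool = 1 gives
    // sext(0) = 0 and zext(1) - 1 = 0, so both forms agree.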
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (Subtarget.hasAVX() && VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  auto invertIfNegative = [](SDValue &V) {
    if (SDValue NegVal = isFNEG(V.getNode())) {
      V = NegVal;
      return true;
    }
    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);

  // The multiplication is negated when exactly one of NegA, NegB is set
  // (NegA xor NegB).
  bool NegMul = (NegA != NegB);

  unsigned NewOpcode;
  if (!NegMul)
    NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
  else
    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  if (N->getOpcode() == X86ISD::FMADD_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADD_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS1_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS3_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
    }
  } else {
    assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
           "Unexpected opcode!");
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }

  return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
}

static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND && N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}

/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison, but ignore a
  // comparison with zero because that gets special treatment in EmitTest().
  SDValue X = SetCC->getOperand(0);
  SDValue Y = SetCC->getOperand(1);
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
    return SDValue();

  // TODO: Use PXOR + PTEST for SSE4.1 or later?
  // TODO: Add support for AVX-512.
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);
  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
      (OpSize == 256 && Subtarget.hasAVX2())) {
    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
    SDValue VecX = DAG.getBitcast(VecVT, X);
    SDValue VecY = DAG.getBitcast(VecVT, Y);

    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
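    // PCMPEQB sets each byte lane to 0xFF on a match and 0x00 otherwise, and
    // PMOVMSKB packs the sign bit of every byte into a GPR, so the all-match
    // mask is 0xFFFF for 16 bytes and 0xFFFFFFFF for 32 bytes.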
    SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
                                    MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  return SDValue();
}

static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    EVT OpVT = LHS.getValueType();
    // 0-x == y --> x+y == 0
    // 0-x != y --> x+y != 0
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.getScalarType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (!IsSEXT0 || !IsVZero1) {
      // Swap the operands and update the condition code.
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);

      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
    }

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}

static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1, so the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
  }
  return SDValue();
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}

/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}

static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //    AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)
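  //
  // E.g. for (sint_to_fp (and (vector_cmp x, y), <1,1,1,1>)): each input lane
  // is 0 or 1, so each result lane is 0.0 or 1.0, and both bit patterns fall
  // out of ANDing the all-zeros/all-ones compare mask with the bits of 1.0f.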
  //
  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
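  // After the zero-extension the value is at most 2^16 - 1, well below 2^31,
  // so the i32 sign bit is clear and signed and unsigned conversion produce
  // the same result.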
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
      return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Op0))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);

  return SDValue();
}

static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() &&
      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
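  // E.g. an i64 input with at least 64 - 31 = 33 sign bits lies in
  // [-2^31, 2^31), so converting its i32 truncation yields the same FP value.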
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned BitWidth = InVT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
    if (NumSignBits >= (BitWidth - 31)) {
      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
      if (InVT.isVector())
        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
                                   InVT.getVectorNumElements());
      SDLoc dl(N);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT LdVT = Ld->getValueType(0);

    // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();

    if (!Ld->isVolatile() && !VT.isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !Subtarget.is64Bit() && LdVT == MVT::i64) {
      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
                             X86TargetLowering::DAGCombinerInfo &DCI) {
  // When legalizing carry, we create carries via add X, -1.
  // If that comes from an actual carry, via setcc, we use the
  // carry directly.
  if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) {
    SDValue Carry = N->getOperand(0);
    while (Carry.getOpcode() == ISD::TRUNCATE ||
           Carry.getOpcode() == ISD::ZERO_EXTEND ||
           Carry.getOpcode() == ISD::SIGN_EXTEND ||
           Carry.getOpcode() == ISD::ANY_EXTEND ||
           (Carry.getOpcode() == ISD::AND &&
            isOneConstant(Carry.getOperand(1))))
      Carry = Carry.getOperand(0);

    if (Carry.getOpcode() == X86ISD::SETCC ||
        Carry.getOpcode() == X86ISD::SETCC_CARRY) {
      if (Carry.getConstantOperandVal(0) == X86::COND_B)
        return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
    }
  }

  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
                          X86TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // the flag result is unused.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, DL,
                                                           MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  return SDValue();
}

/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones
/// mask, which is more useful than 0/1 in some cases.
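/// E.g. "sbb %eax, %eax" computes eax - eax - CF = -CF, i.e. 0 when the carry
/// is clear and 0xFFFFFFFF when it is set, so the carry flag becomes a full
/// mask in one instruction.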
static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
  SDLoc DL(N);

  // "Condition code B" is also known as "the carry flag" (CF).
  SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
  SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
  MVT VT = N->getSimpleValueType(0);
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
}

/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
  bool IsSub = N->getOpcode() == ISD::SUB;
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1);

  // If this is an add, canonicalize a zext operand to the RHS.
  // TODO: Incomplete? What if both sides are zexts?
  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
      Y.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(X, Y);

  // Look through a one-use zext.
  bool PeekedThroughZext = false;
  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
    Y = Y.getOperand(0);
    PeekedThroughZext = true;
  }

  // If this is an add, canonicalize a setcc operand to the RHS.
  // TODO: Incomplete? What if both sides are setcc?
  // TODO: Should we allow peeking through a zext of the other operand?
  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
      Y.getOpcode() != X86ISD::SETCC)
    std::swap(X, Y);

  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

  if (CC == X86::COND_B) {
    // X + SETB Z --> X + (mask SBB Z, Z)
    // X - SETB Z --> X - (mask SBB Z, Z)
    // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
    SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
    if (SBB.getValueSizeInBits() != VT.getSizeInBits())
      SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
  }

  if (CC == X86::COND_A) {
    SDValue EFLAGS = Y->getOperand(1);
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because the Cmp
    // instruction cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
      SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
      if (SBB.getValueSizeInBits() != VT.getSizeInBits())
        SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
    }
  }

  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = Y.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  // (cmp Z, 1) sets the carry flag if Z is 0.
  SDValue Z = Cmp.getOperand(0);
  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
                               DAG.getConstant(1, DL, Z.getValueType()));

  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);

  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
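  // "cmp Z, 1" sets CF exactly when Z == 0 (unsigned Z < 1). E.g.
  // "sbb X, -1" then computes X - (-1) - CF = X + 1 - CF, which is X + 1 when
  // Z != 0 and X when Z == 0 -- exactly X + (Z != 0).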
  if (CC == X86::COND_NE)
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
                       DAG.getConstant(-1ULL, DL, VT), NewCmp);

  // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
  // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
                     DAG.getConstant(0, DL, VT), NewCmp);
}

static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue MulOp = N->getOperand(0);
  SDValue Phi = N->getOperand(1);

  if (MulOp.getOpcode() != ISD::MUL)
    std::swap(MulOp, Phi);
  if (MulOp.getOpcode() != ISD::MUL)
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();

  EVT VT = N->getValueType(0);

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;
  unsigned VectorSize = VT.getVectorNumElements() * 16;
  // If the vector size is less than 128, or greater than the supported
  // RegSize, do not use PMADD.
  if (VectorSize < 128 || VectorSize > RegSize)
    return SDValue();

  SDLoc DL(N);
  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                   VT.getVectorNumElements());
  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                VT.getVectorNumElements() / 2);

  // Shrink the operands of the mul.
  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));

  // The madd vector size is half of the original vector size.
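  // VPMADDWD multiplies adjacent pairs of i16 elements and sums each pair
  // into one i32 lane, so 2*N i16 inputs become N i32 partial sums.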
  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
  // Fill the rest of the output with 0.
  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}

static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // TODO: There's nothing special about i32, any integer type above i16 should
  // work just as well.
  if (!VT.isVector() || !VT.isSimple() ||
      !(VT.getVectorElementType() == MVT::i32))
    return SDValue();

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;

  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();

  // We know N is a reduction add, which means one of its operands is a phi.
  // To match SAD, we need the other operand to be a vector select.
  SDValue SelectOp, Phi;
  if (Op0.getOpcode() == ISD::VSELECT) {
    SelectOp = Op0;
    Phi = Op1;
  } else if (Op1.getOpcode() == ISD::VSELECT) {
    SelectOp = Op1;
    Phi = Op0;
  } else
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
    return SDValue();

  // SAD pattern detected. Now build a SAD instruction and an addition for
  // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we could only update
  // part of elements in the reduction vector.
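  // PSADBW sums the absolute differences of each group of 8 byte pairs into
  // one i64 lane, so a 128-bit input yields just two i64 results.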
  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

  // The output of PSADBW is a vector of i64.
  // We need to turn the vector of i64 into a vector of i32.
  // If the reduction vector is at least as wide as the psadbw result, just
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
  // anyway.
  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
    // Update part of elements of the reduction vector. This is done by first
    // extracting a sub-vector from it, updating this sub-vector, and inserting
    // it back.
    SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
                                 DAG.getIntPtrConstant(0, DL));
    SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
                       DAG.getIntPtrConstant(0, DL));
  }
  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}

static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  const SDNodeFlags Flags = N->getFlags();
  if (Flags.hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
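    // This is two's complement: -Y == ~Y + 1, and with Y == A ^ ImmC we get
    // ~Y == A ^ ~ImmC, so X - Y folds to (A ^ ~ImmC) + (X + 1) with both
    // constants folded.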
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
  unsigned Opcode = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  MVT SVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
  unsigned InputBits = OpEltSizeInBits * NumElts;

  // Perform any constant folding.
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
    APInt Undefs(NumElts, 0);
    SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
    for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
                       : EltBits[i].sextOrTrunc(EltSizeInBits);
    }
    return getConstVector(Vals, Undefs, VT, DAG, DL);
  }

  // (vzext (bitcast (vzext x))) -> (vzext x)
  // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
  SDValue V = peekThroughBitcasts(Op);
  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext.
    // This is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of
    // the inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  // TODO: Add X86ISD::VSEXT support.
  if (Opcode == X86ISD::VZEXT &&
      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                  OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}

/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  MVT VT = RHS.getSimpleValueType();
  SDLoc DL(N);

  auto *C = dyn_cast<ConstantSDNode>(RHS);
  if (!C || C->getZExtValue() != 1)
    return SDValue();

  RHS = DAG.getConstant(-1, DL, VT);
  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other),
                                 {Chain, LHS, RHS}, VT, MMO);
}

// TEST (AND a, b), (AND a, b) -> TEST a, b
static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  return DAG.getNode(X86ISD::TESTM, DL, VT,
                     Op0->getOperand(0), Op0->getOperand(1));
}

static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  if (N->getOperand(0) == N->getOperand(1)) {
    if (N->getOpcode() == X86ISD::PCMPEQ)
      return getOnesVector(VT, DAG, DL);
    if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}

static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc dl(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);
  SDValue Idx = N->getOperand(2);

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT OpVT = N->getSimpleValueType(0);
  MVT SubVecVT = SubVec.getSimpleValueType();

  // If this is an insert of an extract, combine to a shuffle. Don't do this
  // if the insert or extract can be represented with a subvector operation.
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
      (IdxVal != 0 || !Vec.isUndef())) {
    int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
    if (ExtIdxVal != 0) {
      int VecNumElts = OpVT.getVectorNumElements();
      int SubVecNumElts = SubVecVT.getVectorNumElements();
      SmallVector<int, 64> Mask(VecNumElts);
      // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
      // Now insert the extracted portion.
      for (int i = 0; i != SubVecNumElts; ++i)
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }

  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr + 16), Elts/2)
  // --> load32 addr
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr + 32), Elts/2)
  // --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr), Elts/2)
  // --> X86SubVBroadcast(load16 addr)
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr), Elts/2)
  // --> X86SubVBroadcast(load32 addr)
  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
    if (Idx2 && Idx2->getZExtValue() == 0) {
      SDValue SubVec2 = Vec.getOperand(1);
      // If needed, look through bitcasts to get to the load.
      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
        bool Fast;
        unsigned Alignment = FirstLd->getAlignment();
        unsigned AS = FirstLd->getAddressSpace();
        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
                                    OpVT, AS, Alignment, &Fast) && Fast) {
          SDValue Ops[] = {SubVec2, SubVec};
          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
            return Ld;
        }
      }
      // If lower/upper loads are the same and the only users of the load, then
      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
            SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
        }
      }
      // If this is subv_broadcast insert into both halves, use a larger
      // subv_broadcast.
      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
                           SubVec.getOperand(0));
      }
    }
  }

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case X86ISD::PEXTRW:
  case X86ISD::PEXTRB:
    return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
  case ISD::INSERT_SUBVECTOR:
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
  case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
  case ISD::SUB:            return combineSub(N, DAG, Subtarget);
  case X86ISD::ADD:         return combineX86ADD(N, DAG, DCI);
  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return combineStore(N, DAG, Subtarget);
  case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
  case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
  case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::BT:          return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
  case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VSEXT:
  case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP:       // Handle all target specific shuffles
  case X86ISD::INSERTPS:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
  case X86ISD::FMADD:
  case X86ISD::FMADD_RND:
  case X86ISD::FMADDS1_RND:
  case X86ISD::FMADDS3_RND:
  case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
  case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
  case X86ISD::TESTM:       return combineTestM(N, DAG);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
  }

  return SDValue();
}

/// Return true if the target has native support for the specified value type
/// and it is 'desirable' to use the type for the given node type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}

void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
  if (hasCopyImplyingStackAdjustment(MF)) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    MFI.setHasCopyImplyingStackAdjustment(true);
  }

  TargetLoweringBase::finalizeLowering(MF);
}

/// This method queries the target whether it is beneficial for dag combiner to
/// promote the specified node. If true, it should return the desired promotion
/// type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a string separated by whitespace.
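// E.g. matchAsm("bswap $0", {"bswap", "$0"}) succeeds, while matching
// "bswapl $0" against the piece "bswap" fails: after a piece is consumed,
// the next character must be whitespace, so a bare prefix is rejected.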
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return S.empty();
}

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
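// Illustrative example (not from the original source): a call like
//   asm("bswap $0" : "=r"(x) : "0"(x))
// on an i32 value is replaced with a call to llvm.bswap.i32, letting the
// backend choose the best byte-swap lowering instead of opaque inline asm.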
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'v':
    case 'Y':
    case 'l':
      return C_RegisterClass;
    case 'k': // AVX512 masking registers.
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'k':
        return C_Register;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
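// For reference (illustrative note): 'x' (any SSE register) classifies as
// C_RegisterClass, 'a' (the AX/EAX/RAX family) as C_Register, immediate-range
// letters such as 'I' as C_Other, and the two-letter "Yk" constraint as a
// register constraint for AVX512 mask registers.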
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
  X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y':
    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
    if (constraint[1] == 'k') {
      // Support for 'Yk' (similarly to the 'k' variant below).
      weight = CW_SpecificReg;
      break;
    }
    // Else fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    weight = CW_SpecificReg;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
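// E.g. (illustrative): an "X"-constrained f32 operand becomes "Y" when SSE2
// is available and "x" with only SSE1; integer and other types fall through
// to the generic handling.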
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
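// For instance (illustrative, not from the original source), with the 'I'
// constraint (a 0..31 immediate, typical for 32-bit shift counts) a constant
// 31 is accepted and emitted as a target constant, while 32 hits the early
// return above, leaves Ops empty, and thereby rejects that alternative.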
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      LLVM_FALLTHROUGH;
      // 32-bit fallthrough
    case 'Q': // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r': // GENERAL_REGS
    case 'l': // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R': // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f': // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y': // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y': // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8: case MVT::v8i16: case MVT::v4i32:
      case MVT::v2i64: case MVT::v4f32: case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8: case MVT::v16i16: case MVT::v8i32:
      case MVT::v4i64: case MVT::v8f32: case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res; // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error;
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
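// Illustrative example (not from the original source): the constraint "{ax}"
// with an i32 operand first resolves to AX in a 16-bit class; the isGRClass
// fix-up above then rewrites it to EAX in GR32, so "{ax}",i32 behaves like
// {eax} instead of keeping a mismatched 16-bit register.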
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having less micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
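// For example (illustrative): an AddrMode describing (%rsi,%rdx,2) has
// AM.Scale == 2 and costs 1, a plain (%rsi) access has AM.Scale == 0 and
// costs 0, and an illegal addressing mode returns -1.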
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
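// E.g. (illustrative): under minsize, a scalar udiv by a constant keeps the
// compact div instruction, while a vector udiv still reports "not cheap" so
// it can be strength-reduced in vector form rather than scalarized into
// several divs.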
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
    Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}